diff --git a/.github/workflows/simdutf-is-genuine.yml b/.github/workflows/simdutf-is-genuine.yml
new file mode 100644
index 00000000..b053d8dd
--- /dev/null
+++ b/.github/workflows/simdutf-is-genuine.yml
@@ -0,0 +1,18 @@
+name: simdutf-is-genuine
+
+on:
+  - push
+  - pull_request
+
+jobs:
+  simdutf-is-genuine:
+    runs-on: ubuntu-slim
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Check if vendored simdutf is genuine
+        run: |
+          cd simdutf
+          curl -L https://github.com/simdutf/simdutf/releases/download/v8.0.0/singleheader.zip -O
+          unzip -o singleheader.zip
+          git diff --exit-code
diff --git a/cbits/validate_utf8.cpp b/cbits/validate_utf8.cpp
deleted file mode 100644
index 9fada135..00000000
--- a/cbits/validate_utf8.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "simdutf.h"
-
-extern "C"
-int _hs_text_is_valid_utf8(const char* str, size_t len){
-  return simdutf::validate_utf8(str, len);
-}
-
-extern "C"
-int _hs_text_is_valid_utf8_offset(const char* str, size_t off, size_t len){
-  return simdutf::validate_utf8(str + off, len);
-}
diff --git a/simdutf/hs_simdutf.c b/simdutf/hs_simdutf.c
new file mode 100644
index 00000000..317aec20
--- /dev/null
+++ b/simdutf/hs_simdutf.c
@@ -0,0 +1,9 @@
+#include "simdutf_c.h"
+
+int _hs_text_is_valid_utf8(const char *buf, size_t len) {
+  return simdutf_validate_utf8(buf, len);
+}
+
+int _hs_text_is_valid_utf8_offset(const char *buf, size_t off, size_t len) {
+  return simdutf_validate_utf8(buf + off, len);
+}
diff --git a/simdutf/simdutf.cpp b/simdutf/simdutf.cpp
index 3aee47b2..b9e32da9 100644
--- a/simdutf/simdutf.cpp
+++ b/simdutf/simdutf.cpp
@@ -1,733 +1,1772 @@
-/* auto-generated on 2024-05-07 22:33:11 -0400. Do not edit! */
+/* auto-generated on 2026-01-13 09:03:21 +0100. Do not edit! */
 /* begin file src/simdutf.cpp */
 #include "simdutf.h"
-// We include base64_tables once.
-/* begin file src/tables/base64_tables.h */
-#ifndef SIMDUTF_BASE64_TABLES_H
-#define SIMDUTF_BASE64_TABLES_H
-#include <cstdint>
+
+/* begin file src/encoding_types.cpp */
+
+namespace simdutf {
+std::string to_string(encoding_type bom) {
+  switch (bom) {
+  case UTF16_LE:
+    return "UTF16 little-endian";
+  case UTF16_BE:
+    return "UTF16 big-endian";
+  case UTF32_LE:
+    return "UTF32 little-endian";
+  case UTF32_BE:
+    return "UTF32 big-endian";
+  case UTF8:
+    return "UTF8";
+  case unspecified:
+    return "unknown";
+  default:
+    return "error";
+  }
+}
+
+namespace BOM {
+// Note that BOM for UTF8 is discouraged.
+encoding_type check_bom(const uint8_t *byte, size_t length) {
+  if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
+    if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
+      return encoding_type::UTF32_LE;
+    } else {
+      return encoding_type::UTF16_LE;
+    }
+  } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
+    return encoding_type::UTF16_BE;
+  } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and
+             byte[2] == 0xfe and byte[3] == 0xff) {
+    return encoding_type::UTF32_BE;
+  } else if (length >= 3 && byte[0] == 0xef and byte[1] == 0xbb and
+             byte[2] == 0xbf) {
+    return encoding_type::UTF8;
+  }
+  return encoding_type::unspecified;
+}
+
+encoding_type check_bom(const char *byte, size_t length) {
+  return check_bom(reinterpret_cast<const uint8_t *>(byte), length);
+}
+
+size_t bom_byte_size(encoding_type bom) {
+  switch (bom) {
+  case UTF16_LE:
+    return 2;
+  case UTF16_BE:
+    return 2;
+  case UTF32_LE:
+    return 4;
+  case UTF32_BE:
+    return 4;
+  case UTF8:
+    return 3;
+  case unspecified:
+    return 0;
+  default:
+    return 0;
+  }
+}
+
+} // namespace BOM
+} // namespace simdutf
+/* end file src/encoding_types.cpp */
+/* begin file src/error.cpp */
+namespace simdutf {
+// deliberately empty
+}
+/* end file src/error.cpp */
+// The large tables should be included once and they
+// should not depend on a kernel.
 /* begin file src/tables/utf8_to_utf16_tables.h */
+#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
+#define SIMDUTF_UTF8_TO_UTF16_TABLES_H
 #include <cstdint>
 
 namespace simdutf {
 namespace {
 namespace tables {
-namespace base64 {
-namespace base64_default {
-
-const char e0[256] = {
-    'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
-    'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H',
-    'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L',
-    'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O',
-    'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S',
-    'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W',
-    'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a',
-    'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
-    'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h',
-    'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l',
-    'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p',
-    'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's',
-    't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w',
-    'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0',
-    '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4',
-    '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
-    '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', '+', '+', '/', '/', '/',
-    '/'};
-
-const char e1[256] = {
-    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
-    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
-    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
-    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
-    '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
-    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
-    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
-    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1',
'2', '3', - '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', - 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', - 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', - 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', - 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', - 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', - 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', - 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', - '/'}; - -const char e2[256] = { - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', - 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', - 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', - 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', - 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', - 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', - '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', - 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', - 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', - 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', - 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', - 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', - 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', - 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', - '/'}; - -const uint32_t d0[256] = { - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc, - 0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4, - 0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, - 0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018, - 0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030, - 0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048, - 0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060, - 0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078, - 0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090, - 0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8, - 0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 
0x000000c0, - 0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; - -const uint32_t d1[256] = { - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003, - 0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003, - 0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, - 0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000, - 0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000, - 0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001, - 0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001, - 0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001, - 0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002, - 0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002, - 0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003, - 0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 
0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; - -const uint32_t d2[256] = { - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00, - 0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00, - 0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, - 0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100, - 0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300, - 0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400, - 0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600, - 0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700, - 0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900, - 0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00, - 0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00, - 0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 
0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; - -const uint32_t d3[256] = { - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000, - 0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000, - 0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, - 0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000, - 0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000, - 0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000, - 0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000, - 0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000, - 0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000, - 0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000, - 0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000, - 0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; -} // namespace base64_default - -namespace base64_url { - -const char e0[256] = { - 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D', - 'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H', - 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L', - 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', - 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S', - 'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W', - 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a', - 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', - 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h', - 'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l', - 'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p', - 'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's', - 't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w', - 'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0', - '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4', - '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7', - '8', '8', '8', '8', '9', '9', '9', '9', '-', '-', '-', '-', '_', '_', '_', - '_'}; - -const char e1[256] = { - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', - 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', - 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', - 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', - 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', - 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', - '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', - 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', - 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', - 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', - 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', - 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', - 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', - 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', - '_'}; - -const char e2[256] = { - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', - 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', - 'e', 'f', 
'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', - 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', - 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', - 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', - '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', - 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', - 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', - 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', - 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', - 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', - 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', - 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', - '_'}; - -const uint32_t d0[256] = { - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, - 0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4, - 0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, - 0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018, - 0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030, - 0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048, - 0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060, - 0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc, - 0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078, - 0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090, - 0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8, - 0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0, - 0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; -const uint32_t d1[256] = { - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, - 0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003, - 0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, - 0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000, - 0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000, - 0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001, - 0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001, - 0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003, - 0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001, - 0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002, - 0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002, - 0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003, - 0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 
- 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; -const uint32_t d2[256] = { - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, - 0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00, - 0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, - 0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100, - 0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300, - 0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400, - 0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600, - 0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00, - 0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700, - 0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900, - 0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00, - 0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00, - 0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 
0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; -const uint32_t d3[256] = { - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, - 0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000, - 0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, - 0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000, - 0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000, - 0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000, - 0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000, - 0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000, - 0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000, - 0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000, - 0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000, - 0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000, - 0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, - 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; -} // namespace base64_url -const uint64_t thintable_epi8[256] = { - 0x0706050403020100, 0x0007060504030201, 0x0007060504030200, - 0x0000070605040302, 0x0007060504030100, 0x0000070605040301, - 0x0000070605040300, 0x0000000706050403, 0x0007060504020100, - 0x0000070605040201, 0x0000070605040200, 
0x0000000706050402, - 0x0000070605040100, 0x0000000706050401, 0x0000000706050400, - 0x0000000007060504, 0x0007060503020100, 0x0000070605030201, - 0x0000070605030200, 0x0000000706050302, 0x0000070605030100, - 0x0000000706050301, 0x0000000706050300, 0x0000000007060503, - 0x0000070605020100, 0x0000000706050201, 0x0000000706050200, - 0x0000000007060502, 0x0000000706050100, 0x0000000007060501, - 0x0000000007060500, 0x0000000000070605, 0x0007060403020100, - 0x0000070604030201, 0x0000070604030200, 0x0000000706040302, - 0x0000070604030100, 0x0000000706040301, 0x0000000706040300, - 0x0000000007060403, 0x0000070604020100, 0x0000000706040201, - 0x0000000706040200, 0x0000000007060402, 0x0000000706040100, - 0x0000000007060401, 0x0000000007060400, 0x0000000000070604, - 0x0000070603020100, 0x0000000706030201, 0x0000000706030200, - 0x0000000007060302, 0x0000000706030100, 0x0000000007060301, - 0x0000000007060300, 0x0000000000070603, 0x0000000706020100, - 0x0000000007060201, 0x0000000007060200, 0x0000000000070602, - 0x0000000007060100, 0x0000000000070601, 0x0000000000070600, - 0x0000000000000706, 0x0007050403020100, 0x0000070504030201, - 0x0000070504030200, 0x0000000705040302, 0x0000070504030100, - 0x0000000705040301, 0x0000000705040300, 0x0000000007050403, - 0x0000070504020100, 0x0000000705040201, 0x0000000705040200, - 0x0000000007050402, 0x0000000705040100, 0x0000000007050401, - 0x0000000007050400, 0x0000000000070504, 0x0000070503020100, - 0x0000000705030201, 0x0000000705030200, 0x0000000007050302, - 0x0000000705030100, 0x0000000007050301, 0x0000000007050300, - 0x0000000000070503, 0x0000000705020100, 0x0000000007050201, - 0x0000000007050200, 0x0000000000070502, 0x0000000007050100, - 0x0000000000070501, 0x0000000000070500, 0x0000000000000705, - 0x0000070403020100, 0x0000000704030201, 0x0000000704030200, - 0x0000000007040302, 0x0000000704030100, 0x0000000007040301, - 0x0000000007040300, 0x0000000000070403, 0x0000000704020100, - 0x0000000007040201, 0x0000000007040200, 0x0000000000070402, - 0x0000000007040100, 0x0000000000070401, 0x0000000000070400, - 0x0000000000000704, 0x0000000703020100, 0x0000000007030201, - 0x0000000007030200, 0x0000000000070302, 0x0000000007030100, - 0x0000000000070301, 0x0000000000070300, 0x0000000000000703, - 0x0000000007020100, 0x0000000000070201, 0x0000000000070200, - 0x0000000000000702, 0x0000000000070100, 0x0000000000000701, - 0x0000000000000700, 0x0000000000000007, 0x0006050403020100, - 0x0000060504030201, 0x0000060504030200, 0x0000000605040302, - 0x0000060504030100, 0x0000000605040301, 0x0000000605040300, - 0x0000000006050403, 0x0000060504020100, 0x0000000605040201, - 0x0000000605040200, 0x0000000006050402, 0x0000000605040100, - 0x0000000006050401, 0x0000000006050400, 0x0000000000060504, - 0x0000060503020100, 0x0000000605030201, 0x0000000605030200, - 0x0000000006050302, 0x0000000605030100, 0x0000000006050301, - 0x0000000006050300, 0x0000000000060503, 0x0000000605020100, - 0x0000000006050201, 0x0000000006050200, 0x0000000000060502, - 0x0000000006050100, 0x0000000000060501, 0x0000000000060500, - 0x0000000000000605, 0x0000060403020100, 0x0000000604030201, - 0x0000000604030200, 0x0000000006040302, 0x0000000604030100, - 0x0000000006040301, 0x0000000006040300, 0x0000000000060403, - 0x0000000604020100, 0x0000000006040201, 0x0000000006040200, - 0x0000000000060402, 0x0000000006040100, 0x0000000000060401, - 0x0000000000060400, 0x0000000000000604, 0x0000000603020100, - 0x0000000006030201, 0x0000000006030200, 0x0000000000060302, - 0x0000000006030100, 0x0000000000060301, 0x0000000000060300, 
- 0x0000000000000603, 0x0000000006020100, 0x0000000000060201, - 0x0000000000060200, 0x0000000000000602, 0x0000000000060100, - 0x0000000000000601, 0x0000000000000600, 0x0000000000000006, - 0x0000050403020100, 0x0000000504030201, 0x0000000504030200, - 0x0000000005040302, 0x0000000504030100, 0x0000000005040301, - 0x0000000005040300, 0x0000000000050403, 0x0000000504020100, - 0x0000000005040201, 0x0000000005040200, 0x0000000000050402, - 0x0000000005040100, 0x0000000000050401, 0x0000000000050400, - 0x0000000000000504, 0x0000000503020100, 0x0000000005030201, - 0x0000000005030200, 0x0000000000050302, 0x0000000005030100, - 0x0000000000050301, 0x0000000000050300, 0x0000000000000503, - 0x0000000005020100, 0x0000000000050201, 0x0000000000050200, - 0x0000000000000502, 0x0000000000050100, 0x0000000000000501, - 0x0000000000000500, 0x0000000000000005, 0x0000000403020100, - 0x0000000004030201, 0x0000000004030200, 0x0000000000040302, - 0x0000000004030100, 0x0000000000040301, 0x0000000000040300, - 0x0000000000000403, 0x0000000004020100, 0x0000000000040201, - 0x0000000000040200, 0x0000000000000402, 0x0000000000040100, - 0x0000000000000401, 0x0000000000000400, 0x0000000000000004, - 0x0000000003020100, 0x0000000000030201, 0x0000000000030200, - 0x0000000000000302, 0x0000000000030100, 0x0000000000000301, - 0x0000000000000300, 0x0000000000000003, 0x0000000000020100, - 0x0000000000000201, 0x0000000000000200, 0x0000000000000002, - 0x0000000000000100, 0x0000000000000001, 0x0000000000000000, - 0x0000000000000000, -}; +namespace utf8_to_utf16 { +/** + * utf8bigindex uses about 8 kB + * shufutf8 uses about 3344 B + * + * So we use a bit over 11 kB. It would be + * easy to save about 4 kB by only + * storing the index in utf8bigindex, and + * deriving the consumed bytes otherwise. + * However, this may come at a significant (10% to 20%) + * performance penalty. 
+ */ -const uint8_t pshufb_combine_table[272] = { - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, - 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, - 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, - 0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, - 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08, - 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, - 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b, - 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, -}; +const uint8_t shufutf8[209][16] = { + {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 
9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 2, 
255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 
255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}}; +/* number of two bytes : 64 */ +/* number of two + three bytes : 145 */ +/* number of two + 
three + four bytes : 209 */ +const uint8_t utf8bigindex[4096][2] = { + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, + {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, + {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, + {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, + {148, 6}, {209, 12}, {151, 6}, {163, 6}, {66, 6}, {209, 12}, {154, 6}, + {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, + {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, + {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {152, 7}, + {164, 7}, {145, 3}, {209, 12}, {155, 7}, {167, 7}, {69, 7}, {179, 7}, + {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {170, 7}, {71, 7}, + {182, 7}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, + {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, + {173, 7}, {148, 6}, {185, 7}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, + {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, + {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, + {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, + {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {145, 3}, {209, 12}, {156, 8}, {168, 8}, {146, 4}, + {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {171, 8}, + {72, 8}, {183, 8}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, + {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, + {209, 12}, {174, 8}, {148, 6}, {186, 8}, {80, 8}, {98, 8}, {66, 6}, + {198, 8}, {86, 8}, {104, 8}, {68, 6}, {122, 8}, {74, 6}, {92, 6}, + {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {76, 6}, + {94, 6}, {5, 8}, {193, 6}, {82, 6}, {100, 6}, {9, 8}, {118, 6}, + {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, + {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, + {112, 8}, {71, 7}, {130, 8}, {77, 7}, {95, 7}, {6, 8}, {194, 7}, + {83, 7}, {101, 7}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, + {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, + {66, 6}, {197, 7}, {85, 7}, {103, 7}, {12, 8}, {121, 7}, {20, 8}, + {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, + {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, + {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, + {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, + {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, + {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, + {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {187, 9}, {81, 9}, + {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, + {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, + {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, + {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, + {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, + {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {77, 7}, {95, 7}, + {7, 9}, {194, 7}, {83, 7}, {101, 7}, 
{11, 9}, {119, 7}, {19, 9}, + {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, + {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {13, 9}, + {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, + {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, + {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, + {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, + {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, + {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, + {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, + {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, + {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, + {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, + {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, + {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, + {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, + {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, + {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, + {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, + {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, + {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, + {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, + {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, + {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, + {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, + {176, 10}, {148, 6}, {188, 10}, {151, 6}, {163, 6}, {66, 6}, {200, 10}, + {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, + {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, + {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, + {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, + {152, 7}, {164, 7}, {145, 3}, {203, 10}, {90, 10}, {108, 10}, {69, 7}, + {126, 10}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {114, 10}, + {71, 7}, {132, 10}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, + {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, + {209, 12}, {173, 7}, {148, 6}, {138, 10}, {79, 7}, {97, 7}, {66, 6}, + {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, + {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, + {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, + {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {145, 3}, {206, 10}, {156, 8}, {168, 8}, + {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, + {116, 10}, {72, 8}, {134, 10}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, + {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, + {209, 12}, {209, 12}, {174, 8}, {148, 6}, {140, 10}, {80, 8}, {98, 8}, + {66, 6}, {198, 8}, {86, 8}, {104, 8}, {15, 10}, {122, 8}, {23, 10}, + {39, 10}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, + {27, 10}, {43, 10}, {5, 8}, {193, 6}, {82, 6}, {51, 10}, {9, 8}, + {118, 6}, {17, 8}, {33, 
8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, + {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, + {158, 7}, {112, 8}, {71, 7}, {130, 8}, {29, 10}, {45, 10}, {6, 8}, + {194, 7}, {83, 7}, {53, 10}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, + {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, + {97, 7}, {66, 6}, {197, 7}, {85, 7}, {57, 10}, {12, 8}, {121, 7}, + {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, + {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, + {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, + {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, + {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, + {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, + {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {142, 10}, + {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, + {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, + {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, + {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, + {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, + {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {30, 10}, + {46, 10}, {7, 9}, {194, 7}, {83, 7}, {54, 10}, {11, 9}, {119, 7}, + {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, + {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {58, 10}, + {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, + {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, + {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, + {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, + {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, + {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, + {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, + {60, 10}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, + {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, + {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, + {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, + {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, + {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, + {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, + {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, + {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, + {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, + {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, + {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, + {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, + {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, + {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, + {209, 12}, {209, 
12}, {148, 6}, {209, 12}, {151, 6}, {163, 6}, {66, 6}, + {209, 12}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, + {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, + {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, + {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {192, 11}, {152, 7}, {164, 7}, {145, 3}, {204, 11}, {155, 7}, {167, 7}, + {69, 7}, {179, 7}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, + {170, 7}, {71, 7}, {182, 7}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, + {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, + {209, 12}, {209, 12}, {173, 7}, {148, 6}, {185, 7}, {79, 7}, {97, 7}, + {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, + {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, + {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, + {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {207, 11}, {156, 8}, + {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, + {159, 8}, {117, 11}, {72, 8}, {135, 11}, {78, 8}, {96, 8}, {65, 5}, + {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, + {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {141, 11}, {80, 8}, + {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, {68, 6}, {122, 8}, + {74, 6}, {92, 6}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, + {128, 8}, {76, 6}, {94, 6}, {5, 8}, {193, 6}, {82, 6}, {100, 6}, + {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, + {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, + {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {77, 7}, {95, 7}, + {6, 8}, {194, 7}, {83, 7}, {101, 7}, {10, 8}, {119, 7}, {18, 8}, + {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, + {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {12, 8}, + {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, + {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, + {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, + {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, + {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, + {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, + {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, + {143, 11}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, + {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, + {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, + {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, + {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, + {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, + {31, 11}, {47, 11}, {7, 9}, {194, 7}, {83, 7}, {55, 11}, {11, 9}, + {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, + {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, + {59, 11}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, + {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, + {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, 
{145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, + {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, + {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, + {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, + {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, + {86, 8}, {61, 11}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, + {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, + {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, + {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, + {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, + {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, + {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, + {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, + {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, + {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, + {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, + {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, + {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, + {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, + {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, + {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, + {209, 12}, {209, 12}, {176, 10}, {148, 6}, {188, 10}, {151, 6}, {163, 6}, + {66, 6}, {200, 10}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, + {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, + {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, + {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {191, 10}, {152, 7}, {164, 7}, {145, 3}, {203, 10}, {90, 10}, + {108, 10}, {69, 7}, {126, 10}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, + {158, 7}, {114, 10}, {71, 7}, {132, 10}, {77, 7}, {95, 7}, {65, 5}, + {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, + {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {138, 10}, {79, 7}, + {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, + {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, + {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, + {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {206, 10}, + {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, + {209, 12}, {159, 8}, {116, 10}, {72, 8}, {134, 10}, {78, 8}, {96, 8}, + {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, + {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {140, 10}, + {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {62, 11}, {15, 10}, + {122, 8}, {23, 10}, {39, 10}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, + {70, 6}, {128, 8}, {27, 10}, {43, 10}, {5, 8}, {193, 6}, {82, 6}, + {51, 10}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, + {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, + {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {29, 10}, + {45, 10}, {6, 8}, {194, 7}, {83, 7}, {53, 10}, {10, 8}, {119, 7}, + {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, + {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {57, 
10}, + {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, + {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, + {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, + {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, + {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, + {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, + {148, 6}, {142, 10}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, + {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, + {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, + {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, + {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, + {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, + {131, 9}, {30, 10}, {46, 10}, {7, 9}, {194, 7}, {83, 7}, {54, 10}, + {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, + {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, + {85, 7}, {58, 10}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, + {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, + {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, + {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, + {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, + {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, + {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, + {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, + {198, 8}, {86, 8}, {60, 10}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, + {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, + {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, + {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, + {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, + {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, + {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, + {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, + {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, + {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, + {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, + {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, + {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, + {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, + {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, + {64, 4}, {209, 12}, {209, 12}, {209, 12}, {148, 6}, {209, 12}, {151, 6}, + {163, 6}, {66, 6}, {209, 12}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, + {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, + {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, + {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {152, 7}, {164, 7}, {145, 3}, {209, 12}, + {155, 7}, {167, 7}, {69, 7}, {179, 7}, {75, 
7}, {93, 7}, {64, 4}, + {209, 12}, {158, 7}, {170, 7}, {71, 7}, {182, 7}, {77, 7}, {95, 7}, + {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, + {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {185, 7}, + {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, + {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, + {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, + {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, + {208, 12}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, + {64, 4}, {209, 12}, {159, 8}, {171, 8}, {72, 8}, {183, 8}, {78, 8}, + {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, + {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, + {186, 8}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, + {68, 6}, {122, 8}, {74, 6}, {92, 6}, {3, 8}, {209, 12}, {157, 6}, + {110, 8}, {70, 6}, {128, 8}, {76, 6}, {94, 6}, {5, 8}, {193, 6}, + {82, 6}, {100, 6}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, + {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, + {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, + {77, 7}, {95, 7}, {6, 8}, {194, 7}, {83, 7}, {101, 7}, {10, 8}, + {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, + {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, + {103, 7}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, + {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, + {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, + {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, + {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, + {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, + {175, 9}, {148, 6}, {144, 12}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, + {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, + {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, + {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, + {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, + {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, + {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, + {71, 7}, {131, 9}, {77, 7}, {95, 7}, {7, 9}, {194, 7}, {83, 7}, + {101, 7}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, + {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, + {197, 7}, {85, 7}, {103, 7}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, + {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, + {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, + {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, + {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, + {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, + {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, + {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, + {66, 6}, {198, 8}, {86, 8}, {104, 8}, {14, 9}, {122, 8}, {22, 9}, + {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, 
{70, 6}, {128, 8}, + {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, + {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, + {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, + {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, + {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, + {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, + {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, + {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, + {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, + {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, + {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, + {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, + {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, + {91, 5}, {64, 4}, {209, 12}, {209, 12}, {176, 10}, {148, 6}, {188, 10}, + {151, 6}, {163, 6}, {66, 6}, {200, 10}, {154, 6}, {166, 6}, {68, 6}, + {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, + {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, + {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7}, {164, 7}, {145, 3}, + {203, 10}, {90, 10}, {108, 10}, {69, 7}, {126, 10}, {75, 7}, {93, 7}, + {64, 4}, {209, 12}, {158, 7}, {114, 10}, {71, 7}, {132, 10}, {77, 7}, + {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, + {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, + {138, 10}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, + {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, + {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, + {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {145, 3}, {206, 10}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, + {161, 4}, {64, 4}, {209, 12}, {159, 8}, {116, 10}, {72, 8}, {134, 10}, + {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, + {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, + {148, 6}, {140, 10}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, + {63, 12}, {15, 10}, {122, 8}, {23, 10}, {39, 10}, {3, 8}, {209, 12}, + {157, 6}, {110, 8}, {70, 6}, {128, 8}, {27, 10}, {43, 10}, {5, 8}, + {193, 6}, {82, 6}, {51, 10}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, + {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, + {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, + {130, 8}, {29, 10}, {45, 10}, {6, 8}, {194, 7}, {83, 7}, {53, 10}, + {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, + {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, + {85, 7}, {57, 10}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, + {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, + {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, + {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, + {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, + {147, 5}, {184, 9}, 
{150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, + {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, + {209, 12}, {175, 9}, {148, 6}, {142, 10}, {81, 9}, {99, 9}, {66, 6}, + {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, + {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, + {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, + {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, + {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, + {113, 9}, {71, 7}, {131, 9}, {30, 10}, {46, 10}, {7, 9}, {194, 7}, + {83, 7}, {54, 10}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, + {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, + {66, 6}, {197, 7}, {85, 7}, {58, 10}, {13, 9}, {121, 7}, {21, 9}, + {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, + {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, + {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, + {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, + {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, + {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, + {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, + {98, 8}, {66, 6}, {198, 8}, {86, 8}, {60, 10}, {14, 9}, {122, 8}, + {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, + {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, + {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, + {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, + {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, + {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, + {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, + {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, + {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, + {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, + {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, + {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, + {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, + {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, + {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {148, 6}, + {209, 12}, {151, 6}, {163, 6}, {66, 6}, {209, 12}, {154, 6}, {166, 6}, + {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, + {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, + {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {192, 11}, {152, 7}, {164, 7}, + {145, 3}, {204, 11}, {155, 7}, {167, 7}, {69, 7}, {179, 7}, {75, 7}, + {93, 7}, {64, 4}, {209, 12}, {158, 7}, {170, 7}, {71, 7}, {182, 7}, + {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, + {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, + {148, 6}, {185, 7}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, + {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, + {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, + {193, 6}, {82, 6}, 
{100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {145, 3}, {207, 11}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, + {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {117, 11}, {72, 8}, + {135, 11}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, + {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, + {174, 8}, {148, 6}, {141, 11}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, + {86, 8}, {104, 8}, {68, 6}, {122, 8}, {74, 6}, {92, 6}, {3, 8}, + {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {76, 6}, {94, 6}, + {5, 8}, {193, 6}, {82, 6}, {100, 6}, {9, 8}, {118, 6}, {17, 8}, + {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, + {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, + {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, + {71, 7}, {130, 8}, {77, 7}, {95, 7}, {6, 8}, {194, 7}, {83, 7}, + {101, 7}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, + {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, + {197, 7}, {85, 7}, {103, 7}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, + {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, + {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, + {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, + {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, + {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, + {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, + {209, 12}, {209, 12}, {175, 9}, {148, 6}, {143, 11}, {81, 9}, {99, 9}, + {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, + {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, + {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, + {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, + {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, + {158, 7}, {113, 9}, {71, 7}, {131, 9}, {31, 11}, {47, 11}, {7, 9}, + {194, 7}, {83, 7}, {55, 11}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, + {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, + {97, 7}, {66, 6}, {197, 7}, {85, 7}, {59, 11}, {13, 9}, {121, 7}, + {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, + {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, + {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, + {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, + {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, + {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, + {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, + {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {61, 11}, {14, 9}, + {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, + {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, + {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, + {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, + {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, + {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, + {18, 8}, {34, 8}, 
{1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, + {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, + {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, + {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, + {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, + {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, + {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, + {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {176, 10}, + {148, 6}, {188, 10}, {151, 6}, {163, 6}, {66, 6}, {200, 10}, {154, 6}, + {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, + {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, + {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7}, + {164, 7}, {145, 3}, {203, 10}, {90, 10}, {108, 10}, {69, 7}, {126, 10}, + {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {114, 10}, {71, 7}, + {132, 10}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, + {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, + {173, 7}, {148, 6}, {138, 10}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, + {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, + {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, + {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, + {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {145, 3}, {206, 10}, {156, 8}, {168, 8}, {146, 4}, + {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {116, 10}, + {72, 8}, {134, 10}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, + {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, + {209, 12}, {174, 8}, {148, 6}, {140, 10}, {80, 8}, {98, 8}, {66, 6}, + {198, 8}, {86, 8}, {62, 11}, {15, 10}, {122, 8}, {23, 10}, {39, 10}, + {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {27, 10}, + {43, 10}, {5, 8}, {193, 6}, {82, 6}, {51, 10}, {9, 8}, {118, 6}, + {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, + {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, + {112, 8}, {71, 7}, {130, 8}, {29, 10}, {45, 10}, {6, 8}, {194, 7}, + {83, 7}, {53, 10}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, + {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, + {66, 6}, {197, 7}, {85, 7}, {57, 10}, {12, 8}, {121, 7}, {20, 8}, + {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, + {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, + {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, + {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, + {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, + {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, + {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {142, 10}, {81, 9}, + {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, + {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, + {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, + {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, 
{209, 12}, + {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, + {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, + {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {30, 10}, {46, 10}, + {7, 9}, {194, 7}, {83, 7}, {54, 10}, {11, 9}, {119, 7}, {19, 9}, + {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, + {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {58, 10}, {13, 9}, + {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, + {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, + {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, + {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, + {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, + {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, + {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, + {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {60, 10}, + {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, + {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, + {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, + {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, + {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, + {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, + {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, + {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, + {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, + {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, + {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, + {0, 6}}; +} // namespace utf8_to_utf16 +} // namespace tables +} // unnamed namespace +} // namespace simdutf -const unsigned char BitsSetTable256mul2[256] = { - 0, 2, 2, 4, 2, 4, 4, 6, 2, 4, 4, 6, 4, 6, 6, 8, 2, 4, 4, - 6, 4, 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 2, 4, 4, 6, 4, 6, - 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, - 8, 8, 10, 8, 10, 10, 12, 2, 4, 4, 6, 4, 6, 6, 8, 4, 6, 6, 8, - 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, - 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, 8, - 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 2, 4, 4, 6, 4, - 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, - 6, 8, 8, 10, 8, 10, 10, 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, - 10, 8, 10, 10, 12, 6, 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, - 12, 14, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, - 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 6, 8, 8, 10, - 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 8, 10, 10, 12, 10, 12, 12, - 14, 10, 12, 12, 14, 12, 14, 14, 16}; - -constexpr uint8_t to_base64_value[] = { - 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 64, 64, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, - 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, - 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, - 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255}; +#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H +/* end file src/tables/utf8_to_utf16_tables.h */ +/* begin file src/tables/utf16_to_utf8_tables.h */ +// file generated by scripts/sse_convert_utf16_to_utf8.py +#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H +#define SIMDUTF_UTF16_TO_UTF8_TABLES_H -constexpr uint8_t to_base64_url_value[] = { - 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 64, 64, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 62, 255, 255, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, - 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 255, 255, 255, 255, 63, 255, 26, 27, 28, 29, 30, 31, 32, 33, - 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255}; -static_assert(sizeof(to_base64_value) == 256, "to_base64_value must have 256 elements"); -static_assert(sizeof(to_base64_url_value) == 256, "to_base64_url_value must have 256 elements"); -static_assert(to_base64_value[uint8_t(' ')] == 64, "space must be == 64 in to_base64_value"); -static_assert(to_base64_url_value[uint8_t(' ')] == 64, "space must be == 64 in to_base64_url_value"); -static_assert(to_base64_value[uint8_t('\t')] == 64, "tab must be == 64 in to_base64_value"); -static_assert(to_base64_url_value[uint8_t('\t')] == 64, "tab must be == 64 in to_base64_url_value"); -static_assert(to_base64_value[uint8_t('\r')] == 64, "cr must be == 64 in to_base64_value"); -static_assert(to_base64_url_value[uint8_t('\r')] == 64, "cr must be == 64 in to_base64_url_value"); -static_assert(to_base64_value[uint8_t('\n')] == 64, "lf must be == 64 in to_base64_value"); -static_assert(to_base64_url_value[uint8_t('\n')] == 64, "lf must be == 64 in to_base64_url_value"); -static_assert(to_base64_value[uint8_t('\f')] == 64, "ff must be == 64 in to_base64_value"); -static_assert(to_base64_url_value[uint8_t('\f')] == 64, "ff must be == 64 in to_base64_url_value"); -static_assert(to_base64_value[uint8_t('+')] == 62, "+ must be == 62 in to_base64_value"); -static_assert(to_base64_url_value[uint8_t('-')] == 62, "- must be == 62 in 
to_base64_url_value"); -static_assert(to_base64_value[uint8_t('/')] == 63, "/ must be == 62 in to_base64_value"); -static_assert(to_base64_url_value[uint8_t('_')] == 63, "_ must be == 62 in to_base64_url_value"); -} // namespace base64 +namespace simdutf { +namespace { +namespace tables { +namespace utf16_to_utf8 { + +// 1 byte for length, 16 bytes for mask +const uint8_t pack_1_2_utf8_bytes[256][17] = { + {16, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}, + {15, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80}, + {15, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80}, + {14, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {15, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80}, + {14, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {14, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {13, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80}, + {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80}, + {13, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {15, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80}, + {14, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {14, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {13, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {14, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {13, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80}, + {13, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80}, + {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80}, + {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 
14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80}, + {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {15, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80}, + {14, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {14, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {13, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {13, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 
0x80}, + {10, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80}, + {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, + {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 
0x80, 0x80}, + {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, + {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 
12, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 
12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}}; + +// 1 byte for length, 16 bytes for mask +const uint8_t pack_1_2_3_utf8_bytes[256][17] = { + {12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80}, + {9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, 0x80, + 0x80}, + {6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 
0x80}, + {6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, + 0x80, 0x80}, + {7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 6, 
7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 2, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}}; + +} // namespace utf16_to_utf8 } // namespace tables } // unnamed namespace } // namespace simdutf -#endif // SIMDUTF_BASE64_TABLES_H -/* end file src/tables/base64_tables.h */ -/* begin file src/implementation.cpp */ -#include -#include -#include +#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H +/* end file src/tables/utf16_to_utf8_tables.h */ +/* begin file src/tables/utf32_to_utf16_tables.h */ +// file generated by scripts/sse_convert_utf32_to_utf16.py +#ifndef SIMDUTF_UTF32_TO_UTF16_TABLES_H +#define SIMDUTF_UTF32_TO_UTF16_TABLES_H -// Useful for debugging purposes namespace simdutf { namespace { +namespace tables { +namespace utf32_to_utf16 { -template -std::string toBinaryString(T b) { - std::string binary = ""; - T mask = T(1) << (sizeof(T) * CHAR_BIT 
- 1); - while (mask > 0) { - binary += ((b & mask) == 0) ? '0' : '1'; - mask >>= 1; - } - return binary; -} -} -} +const uint8_t pack_utf32_to_utf16le[16][16] = { + {0, 1, 4, 5, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 0x80}, + {0, 1, 4, 5, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 0x80, 0x80}, + {0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, +}; + +const uint8_t pack_utf32_to_utf16be[16][16] = { + {1, 0, 5, 4, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 0x80, 0x80}, + {1, 0, 5, 4, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 13, 12, 15, 14, 0x80, 0x80}, + {1, 0, 5, 4, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}, +}; + +} // namespace utf32_to_utf16 +} // namespace tables +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_UTF32_TO_UTF16_TABLES_H +/* end file src/tables/utf32_to_utf16_tables.h */ +// End of tables. + +// Implementations: they need to be set up before including +// scalar/* code, as the scalar code is sometimes enabled +// only for peculiar build targets. -// Implementations // The best choice should always come first!
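The pack tables above all follow the same pshufb/tbl convention: each 16-byte row is a byte-level shuffle pattern, a 0x80 entry marks an output byte that receives no input (the shuffle instruction zeroes it), and the row index encodes which input lanes survive. A rough scalar sketch of that convention (apply_shuffle is a hypothetical helper for illustration, not simdutf API; on x86 a row feeds _mm_shuffle_epi8 directly, on NEON vqtbl1q_u8):

#include <cstddef>
#include <cstdint>

// Scalar model of an SSE pshufb / NEON tbl lookup: each output byte is
// src[mask[i]], except that a mask byte with its high bit set (0x80)
// produces zero instead of reading the source vector.
static void apply_shuffle(const uint8_t src[16], const uint8_t mask[16],
                          uint8_t out[16]) {
  for (size_t i = 0; i < 16; i++) {
    out[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
  }
}

For the length-prefixed tables ("1 byte for length, 16 bytes for mask", e.g. pack_1_2_3_utf8_bytes), row[0] gives the number of useful shuffled bytes: a consumer can store all 16 shuffled bytes unconditionally and then advance its output pointer by row[0] only.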
+#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO +SIMDUTF_DISABLE_UNUSED_WARNING +#endif /* begin file src/simdutf/arm64.h */ #ifndef SIMDUTF_ARM64_H #define SIMDUTF_ARM64_H #ifdef SIMDUTF_FALLBACK_H -#error "arm64.h must be included before fallback.h" + #error "arm64.h must be included before fallback.h" #endif #ifndef SIMDUTF_IMPLEMENTATION_ARM64 -#define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64) + #define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64) #endif #if SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64 -#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 1 + #define SIMDUTF_CAN_ALWAYS_RUN_ARM64 1 #else -#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 0 + #define SIMDUTF_CAN_ALWAYS_RUN_ARM64 0 #endif - #if SIMDUTF_IMPLEMENTATION_ARM64 namespace simdutf { /** * Implementation for NEON (ARMv8). */ -namespace arm64 { -} // namespace arm64 +namespace arm64 {} // namespace arm64 } // namespace simdutf /* begin file src/simdutf/arm64/implementation.h */ @@ -744,88 +1783,279 @@ using namespace simdutf; class implementation final : public simdutf::implementation { public: - simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {} - simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; - simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const 
char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* 
utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; - simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t 
maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options) const noexcept; - size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept; + simdutf_really_inline implementation() + : simdutf::implementation("arm64", "ARM NEON", + internal::instruction_set::NEON) {} +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; +#endif // SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool validate_ascii(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept final; + + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; + void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept final; + void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif 
// SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result 
convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *buf, size_t length, + 
char16_t *output) const noexcept final; + simdutf_warn_unused size_t + count_utf16le(const char16_t *buf, size_t length) const noexcept override; + simdutf_warn_unused size_t + count_utf16be(const char16_t *buf, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept override; + simdutf_warn_unused size_t utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept override; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf16_length_from_utf8( + const char *input, size_t length) const noexcept override; + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept override; + ; + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept override; + ; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf8( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t latin1_length_from_utf8( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t utf8_length_from_latin1( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused result base64_to_binary( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused 
full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept override; + size_t + binary_to_base64_with_lines(const char *input, size_t length, char *output, + size_t line_length, + base64_options options) const noexcept override; + const char *find(const char *start, const char *end, + char character) const noexcept override; + const char16_t *find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept override; +#endif // SIMDUTF_FEATURE_BASE64 }; } // namespace arm64 @@ -837,9 +2067,10 @@ class implementation final : public simdutf::implementation { /* begin file src/simdutf/arm64/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "arm64" // #define SIMDUTF_IMPLEMENTATION arm64 +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 /* end file src/simdutf/arm64/begin.h */ -// Declarations + // Declarations /* begin file src/simdutf/arm64/intrinsics.h */ #ifndef SIMDUTF_ARM64_INTRINSICS_H #define SIMDUTF_ARM64_INTRINSICS_H @@ -861,22 +2092,25 @@ namespace { /* result might be undefined when input_num is zero */ simdutf_really_inline int count_ones(uint64_t input_num) { - return vaddv_u8(vcnt_u8(vcreate_u8(input_num))); + return vaddv_u8(vcnt_u8(vcreate_u8(input_num))); } #if SIMDUTF_NEED_TRAILING_ZEROES simdutf_really_inline int trailing_zeroes(uint64_t input_num) { -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO unsigned long ret; // Search the mask data from least significant bit (LSB) // to the most significant bit (MSB) for a set bit (1). 
_BitScanForward64(&ret, input_num); return (int)ret; -#else // SIMDUTF_REGULAR_VISUAL_STUDIO + #else // SIMDUTF_REGULAR_VISUAL_STUDIO return __builtin_ctzll(input_num); -#endif // SIMDUTF_REGULAR_VISUAL_STUDIO + #endif // SIMDUTF_REGULAR_VISUAL_STUDIO } #endif +template T clear_least_significant_bit(T x) { + return (x & (x - 1)); +} } // unnamed namespace } // namespace arm64 @@ -890,7 +2124,6 @@ simdutf_really_inline int trailing_zeroes(uint64_t input_num) { #include - namespace simdutf { namespace arm64 { namespace { @@ -898,736 +2131,777 @@ namespace simd { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO namespace { -// Start of private section with Visual Studio workaround - -#ifndef simdutf_make_uint8x16_t -#define simdutf_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \ - x13, x14, x15, x16) \ - ([=]() { \ - uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \ - x9, x10, x11, x12, x13, x14, x15, x16}; \ - return vld1q_u8(array); \ - }()) -#endif -#ifndef simdutf_make_int8x16_t -#define simdutf_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \ - x13, x14, x15, x16) \ - ([=]() { \ - int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \ - x9, x10, x11, x12, x13, x14, x15, x16}; \ - return vld1q_s8(array); \ - }()) -#endif - -#ifndef simdutf_make_uint8x8_t -#define simdutf_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ - ([=]() { \ - uint8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ - return vld1_u8(array); \ - }()) -#endif -#ifndef simdutf_make_int8x8_t -#define simdutf_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ - ([=]() { \ - int8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ - return vld1_s8(array); \ - }()) -#endif -#ifndef simdutf_make_uint16x8_t -#define simdutf_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ - ([=]() { \ - uint16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ - return vld1q_u16(array); \ - }()) -#endif -#ifndef simdutf_make_int16x8_t -#define simdutf_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ - ([=]() { \ - int16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ - return vld1q_s16(array); \ - }()) -#endif + // Start of private section with Visual Studio workaround + + #ifndef simdutf_make_uint8x16_t + #define simdutf_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, \ + x11, x12, x13, x14, x15, x16) \ + ([=]() { \ + uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \ + x9, x10, x11, x12, x13, x14, x15, x16}; \ + return vld1q_u8(array); \ + }()) + #endif + #ifndef simdutf_make_int8x16_t + #define simdutf_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, \ + x11, x12, x13, x14, x15, x16) \ + ([=]() { \ + int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \ + x9, x10, x11, x12, x13, x14, x15, x16}; \ + return vld1q_s8(array); \ + }()) + #endif + #ifndef simdutf_make_uint8x8_t + #define simdutf_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ + ([=]() { \ + uint8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ + return vld1_u8(array); \ + }()) + #endif + #ifndef simdutf_make_int8x8_t + #define simdutf_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ + ([=]() { \ + int8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ + return vld1_s8(array); \ + }()) + #endif + #ifndef simdutf_make_uint16x8_t + #define simdutf_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ + ([=]() { \ + uint16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ + return vld1q_u16(array); \ + }()) + #endif + #ifndef simdutf_make_int16x8_t + #define simdutf_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ + ([=]() { \ + int16_t 
array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ + return vld1q_s16(array); \ + }()) + #endif // End of private section with Visual Studio workaround } // namespace #endif // SIMDUTF_REGULAR_VISUAL_STUDIO +template struct simd8; - template - struct simd8; - - // - // Base class of simd8 and simd8, both of which use uint8x16_t internally. - // - template> - struct base_u8 { - uint8x16_t value; - static const int SIZE = sizeof(value); - - // Conversion from/to SIMD register - simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {} - simdutf_really_inline operator const uint8x16_t&() const { return this->value; } - simdutf_really_inline operator uint8x16_t&() { return this->value; } - simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); } - simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); } - - // Bit operations - simdutf_really_inline simd8 operator|(const simd8 other) const { return vorrq_u8(*this, other); } - simdutf_really_inline simd8 operator&(const simd8 other) const { return vandq_u8(*this, other); } - simdutf_really_inline simd8 operator^(const simd8 other) const { return veorq_u8(*this, other); } - simdutf_really_inline simd8 bit_andnot(const simd8 other) const { return vbicq_u8(*this, other); } - simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } - simdutf_really_inline simd8& operator|=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast | other; return *this_cast; } - simdutf_really_inline simd8& operator&=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast & other; return *this_cast; } - simdutf_really_inline simd8& operator^=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast ^ other; return *this_cast; } - - friend simdutf_really_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return vceqq_u8(lhs, rhs); } - - template - simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { - return vextq_u8(prev_chunk, *this, 16 - N); - } - }; - - // SIMD byte mask type (returned by things like eq and gt) - template<> - struct simd8: base_u8 { - typedef uint16_t bitmask_t; - typedef uint32_t bitmask2_t; - - static simdutf_really_inline simd8 splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); } - - simdutf_really_inline simd8(const uint8x16_t _value) : base_u8(_value) {} - // False constructor - simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {} - // Splat constructor - simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {} - simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } +// +// Base class of simd8 and simd8, both of which use uint8x16_t +// internally. 
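// Illustrative sketch, not part of the vendored file: the simdutf_make_*
// macros above work around MSVC's lack of brace-initializers for NEON vector
// types by loading the constants from a stack array inside an
// immediately-invoked lambda. A minimal standalone version of the idiom:
#include <arm_neon.h>
#include <cstdint>

#define MAKE_U8X8(a, b, c, d, e, f, g, h)                                      \
  ([=]() {                                                                     \
    uint8_t arr[8] = {a, b, c, d, e, f, g, h};                                 \
    return vld1_u8(arr); /* a single 8-byte NEON load of the array */          \
  }())

inline uint8x8_t example_bit_constant() {
  return MAKE_U8X8(1, 2, 4, 8, 16, 32, 64, 128);
}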
+// +template > struct base_u8 { + uint8x16_t value; + static const int SIZE = sizeof(value); + void dump() const { +#ifdef SIMDUTF_LOGGING + uint8_t temp[16]; + vst1q_u8(temp, *this); + printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x,%04x, %04x, %04x, " + "%04x, %04x, %04x, %04x, %04x]\n", + temp[0], temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], + temp[7], temp[8], temp[9], temp[10], temp[11], temp[12], temp[13], + temp[14], temp[15]); +#endif // SIMDUTF_LOGGING + } + // Conversion from/to SIMD register + simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {} + simdutf_really_inline operator const uint8x16_t &() const { + return this->value; + } - // We return uint32_t instead of uint16_t because that seems to be more efficient for most - // purposes (cutting it down to uint16_t costs performance in some compilers). - simdutf_really_inline uint32_t to_bitmask() const { -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t bit_mask = simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); -#else - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; -#endif - auto minput = *this & bit_mask; - uint8x16_t tmp = vpaddq_u8(minput, minput); - tmp = vpaddq_u8(tmp, tmp); - tmp = vpaddq_u8(tmp, tmp); - return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); - } + // Bit operations + simdutf_really_inline simd8 operator|(const simd8 other) const { + return vorrq_u8(*this, other); + } + simdutf_really_inline simd8 operator&(const simd8 other) const { + return vandq_u8(*this, other); + } + simdutf_really_inline simd8 operator^(const simd8 other) const { + return veorq_u8(*this, other); + } + simdutf_really_inline simd8 &operator|=(const simd8 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast | other; + return *this_cast; + } - // Returns 4-bit out of each byte, alternating between the high 4 bits and low bits - // result it is 64 bit. - // This method is expected to be faster than none() and is equivalent - // when the vector register is the result of a comparison, with byte - // values 0xff and 0x00. 
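// Illustrative sketch, not part of the vendored file: the SHRN trick behind
// to_bitmask64(). Shifting every 16-bit lane right by 4 while narrowing keeps
// 4 bits per input byte, so a 16-byte 0x00/0xFF comparison mask collapses
// into a single 64-bit scalar in two instructions.
#include <arm_neon.h>
#include <cstdint>

inline uint64_t neon_movemask_nibbles(uint8x16_t cmp) {
  // Each source byte contributes one nibble: 0x0 for 0x00, 0xF for 0xFF.
  const uint8x8_t narrowed = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
  return vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
}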
- simdutf_really_inline uint64_t to_bitmask64() const { - return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0); - } + friend simdutf_really_inline Mask operator==(const simd8 lhs, + const simd8 rhs) { + return vceqq_u8(lhs, rhs); + } - simdutf_really_inline bool any() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0; } - simdutf_really_inline bool none() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) == 0; } - simdutf_really_inline bool all() const { return vminvq_u32(vreinterpretq_u32_u8(*this)) == 0xFFFFF; } + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_u8(prev_chunk, *this, 16 - N); + } +}; +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base_u8 { + static simdutf_really_inline simd8 splat(bool _value) { + return vmovq_n_u8(uint8_t(-(!!_value))); + } - }; + simdutf_really_inline simd8(const uint8x16_t _value) + : base_u8(_value) {} + // False constructor + simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {} + simdutf_really_inline void store(uint8_t dst[16]) const { + return vst1q_u8(dst, *this); + } - // Unsigned bytes - template<> - struct simd8: base_u8 { - static simdutf_really_inline simd8 splat(uint8_t _value) { return vmovq_n_u8(_value); } - static simdutf_really_inline simd8 zero() { return vdupq_n_u8(0); } - static simdutf_really_inline simd8 load(const uint8_t* values) { return vld1q_u8(values); } - simdutf_really_inline simd8(const uint8x16_t _value) : base_u8(_value) {} - // Zero constructor - simdutf_really_inline simd8() : simd8(zero()) {} - // Array constructor - simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} - // Splat constructor - simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} - // Member-by-member initialization + // We return uint32_t instead of uint16_t because that seems to be more + // efficient for most purposes (cutting it down to uint16_t costs performance + // in some compilers). 
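// Illustrative sketch, not part of the vendored file: the pairwise-add
// movemask that to_bitmask() below implements. AND each lane of a 0x00/0xFF
// comparison result with a per-lane bit, then fold with three VPADDQs into a
// 16-bit mask (assumes GCC/Clang-style NEON vector literals).
#include <arm_neon.h>
#include <cstdint>

inline uint32_t neon_movemask16(uint8x16_t cmp) {
  const uint8x16_t bit = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
  uint8x16_t m = vandq_u8(cmp, bit);
  m = vpaddq_u8(m, m); // sum adjacent byte pairs
  m = vpaddq_u8(m, m);
  m = vpaddq_u8(m, m); // lane 0 = bytes 0..7, lane 1 = bytes 8..15
  return vgetq_lane_u16(vreinterpretq_u16_u8(m), 0);
}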
+ simdutf_really_inline uint32_t to_bitmask() const { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - simdutf_really_inline simd8( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) : simd8(simdutf_make_uint8x16_t( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - )) {} + const uint8x16_t bit_mask = + simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); #else - simdutf_really_inline simd8( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) : simd8(uint8x16_t{ - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - }) {} + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; #endif + auto minput = *this & bit_mask; + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); + } - // Repeat 16 values as many times as necessary (usually for lookup tables) - simdutf_really_inline static simd8 repeat_16( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) { - return simd8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } - - // Store to array - simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } - - // Saturated math - simdutf_really_inline simd8 saturating_add(const simd8 other) const { return vqaddq_u8(*this, other); } - simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return vqsubq_u8(*this, other); } - - // Addition/subtraction are the same for signed and unsigned - simdutf_really_inline simd8 operator+(const simd8 other) const { return vaddq_u8(*this, other); } - simdutf_really_inline simd8 operator-(const simd8 other) const { return vsubq_u8(*this, other); } - simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } - simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } - - // Order-specific operations - simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); } - simdutf_really_inline uint8_t min_val() const { return vminvq_u8(*this); } - simdutf_really_inline simd8 max_val(const simd8 other) const { return vmaxq_u8(*this, other); } - simdutf_really_inline simd8 min_val(const simd8 other) const { return vminq_u8(*this, other); } - simdutf_really_inline simd8 operator<=(const simd8 other) const { return vcleq_u8(*this, other); } - simdutf_really_inline simd8 operator>=(const simd8 other) const { return vcgeq_u8(*this, other); } - simdutf_really_inline simd8 operator<(const simd8 other) const { return vcltq_u8(*this, other); } - simdutf_really_inline simd8 operator>(const simd8 other) const { return vcgtq_u8(*this, other); } - // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. 
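// Illustrative sketch, not part of the vendored file: the prev<N> idiom
// defined earlier with vextq_u8. EXT concatenates the previous and current
// 16-byte chunks and extracts a window shifted back by N bytes, the kind of
// cross-block lookback the UTF-8 validation kernels need.
#include <arm_neon.h>
#include <cstdint>

template <int N> // N = bytes of lookback into the previous chunk
inline uint8x16_t prev_bytes(uint8x16_t current, uint8x16_t previous) {
  static_assert(N >= 1 && N <= 15, "lookback must stay within one chunk");
  return vextq_u8(previous, current, 16 - N);
}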
- simdutf_really_inline simd8 gt_bits(const simd8 other) const { return simd8(*this > other); } - // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. - simdutf_really_inline simd8 lt_bits(const simd8 other) const { return simd8(*this < other); } - - // Bit-specific operations - simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return vtstq_u8(*this, bits); } - simdutf_really_inline bool is_ascii() const { return this->max_val() < 0b10000000u; } - - simdutf_really_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; } - simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return (*this & bits).any_bits_set_anywhere(); } - template - simdutf_really_inline simd8 shr() const { return vshrq_n_u8(*this, N); } - template - simdutf_really_inline simd8 shl() const { return vshlq_n_u8(*this, N); } - - // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) - template - simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { - return lookup_table.apply_lookup_16_to(*this); - } - - - template - simdutf_really_inline simd8 lookup_16( - L replace0, L replace1, L replace2, L replace3, - L replace4, L replace5, L replace6, L replace7, - L replace8, L replace9, L replace10, L replace11, - L replace12, L replace13, L replace14, L replace15) const { - return lookup_16(simd8::repeat_16( - replace0, replace1, replace2, replace3, - replace4, replace5, replace6, replace7, - replace8, replace9, replace10, replace11, - replace12, replace13, replace14, replace15 - )); - } - - template - simdutf_really_inline simd8 apply_lookup_16_to(const simd8 original) const { - return vqtbl1q_u8(*this, simd8(original)); - } - }; - - // Signed bytes - template<> - struct simd8 { - int8x16_t value; - - static simdutf_really_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); } - static simdutf_really_inline simd8 zero() { return vdupq_n_s8(0); } - static simdutf_really_inline simd8 load(const int8_t values[16]) { return vld1q_s8(values); } + // Returns 4-bit out of each byte, alternating between the high 4 bits and low + // bits result it is 64 bit. This method is expected to be faster than none() + // and is equivalent when the vector register is the result of a comparison, + // with byte values 0xff and 0x00. + simdutf_really_inline uint64_t to_bitmask64() const { + return vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0); + } +}; - // Use ST2 instead of UXTL+UXTL2 to interleave zeroes. UXTL is actually a USHLL #0, - // and shifting in NEON is actually quite slow. - // - // While this needs the registers to be in a specific order, bigger cores can interleave - // these with no overhead, and it still performs decently on little cores. - // movi v1.3d, #0 - // mov v0.16b, value[0] - // st2 {v0.16b, v1.16b}, [ptr], #32 - // mov v0.16b, value[1] - // st2 {v0.16b, v1.16b}, [ptr], #32 - // ... - template - simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const { - int8x16x2_t pair = match_system(big_endian) - ? 
int8x16x2_t{{this->value, vmovq_n_s8(0)}} - : int8x16x2_t{{vmovq_n_s8(0), this->value}}; - vst2q_s8(reinterpret_cast(p), pair); - } - - // currently unused - // Technically this could be done with ST4 like in store_ascii_as_utf16, but it is - // very much not worth it, as explicitly mentioned in the ARM Cortex-X1 Core Software - // Optimization Guide: - // 4.18 Complex ASIMD instructions - // The bandwidth of [ST4 with element size less than 64b] is limited by decode - // constraints and it is advisable to avoid them when high performing code is desired. - // Instead, it is better to use ZIP1+ZIP2 and two ST2. - simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const { - const uint16x8_t low = vreinterpretq_u16_s8(vzip1q_s8(this->value, vmovq_n_s8(0))); - const uint16x8_t high = vreinterpretq_u16_s8(vzip2q_s8(this->value, vmovq_n_s8(0))); - const uint16x8x2_t low_pair{{ low, vmovq_n_u16(0) }}; - vst2q_u16(reinterpret_cast(p), low_pair); - const uint16x8x2_t high_pair{{ high, vmovq_n_u16(0) }}; - vst2q_u16(reinterpret_cast(p + 8), high_pair); - } - - // In places where the table can be reused, which is most uses in simdutf, it is worth it to do - // 4 table lookups, as there is no direct zero extension from u8 to u32. - simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t * p) const { - const simd8 tb1{ 0,255,255,255, 1,255,255,255, 2,255,255,255, 3,255,255,255 }; - const simd8 tb2{ 4,255,255,255, 5,255,255,255, 6,255,255,255, 7,255,255,255 }; - const simd8 tb3{ 8,255,255,255, 9,255,255,255, 10,255,255,255, 11,255,255,255 }; - const simd8 tb4{ 12,255,255,255, 13,255,255,255, 14,255,255,255, 15,255,255,255 }; - - // encourage store pairing and interleaving - const auto shuf1 = this->apply_lookup_16_to(tb1); - const auto shuf2 = this->apply_lookup_16_to(tb2); - shuf1.store(reinterpret_cast(p)); - shuf2.store(reinterpret_cast(p + 4)); - - const auto shuf3 = this->apply_lookup_16_to(tb3); - const auto shuf4 = this->apply_lookup_16_to(tb4); - shuf3.store(reinterpret_cast(p + 8)); - shuf4.store(reinterpret_cast(p + 12)); - } - // Conversion from/to SIMD register - simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {} - simdutf_really_inline operator const int8x16_t&() const { return this->value; } -#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO - simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); } -#endif - simdutf_really_inline operator int8x16_t&() { return this->value; } - - // Zero constructor - simdutf_really_inline simd8() : simd8(zero()) {} - // Splat constructor - simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} - // Array constructor - simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {} - // Member-by-member initialization +// Unsigned bytes +template <> struct simd8 : base_u8 { + static simdutf_really_inline simd8 splat(uint8_t _value) { + return vmovq_n_u8(_value); + } + static simdutf_really_inline simd8 zero() { return vdupq_n_u8(0); } + static simdutf_really_inline simd8 load(const uint8_t *values) { + return vld1q_u8(values); + } + simdutf_really_inline simd8(const uint8x16_t _value) + : base_u8(_value) {} + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - 
simdutf_really_inline simd8( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) : simd8(simdutf_make_int8x16_t( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - )) {} + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8(simdutf_make_uint8x16_t(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15)) {} #else - simdutf_really_inline simd8( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) : simd8(int8x16_t{ - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - }) {} -#endif - // Repeat 16 values as many times as necessary (usually for lookup tables) - simdutf_really_inline static simd8 repeat_16( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) { - return simd8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } - - // Store to array - simdutf_really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, value); } - // Explicit conversion to/from unsigned - // - // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type. - // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14 - // and relatively ugly and hard to read. 
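// Little-endian sketch, not part of the vendored file: the ST2
// zero-interleave that store_ascii_as_utf16 uses. Storing the data vector
// paired with a zero vector emits byte pairs (ascii, 0x00), i.e. valid
// UTF-16LE, and avoids the UXTL widening shifts the comment above calls slow.
#include <arm_neon.h>
#include <cstdint>

inline void ascii16_to_utf16le(const uint8_t *src, char16_t *dst) {
  uint8x16x2_t pair;
  pair.val[0] = vld1q_u8(src); // low bytes: the ASCII input
  pair.val[1] = vdupq_n_u8(0); // high bytes: zero
  // ST2 interleaves val[0] and val[1], writing 16 char16_t values.
  vst2q_u8(reinterpret_cast<uint8_t *>(dst), pair);
}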
-#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO - simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {} + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8(uint8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} #endif - simdutf_really_inline operator simd8() const { return vreinterpretq_u8_s8(this->value); } - - simdutf_really_inline simd8 operator|(const simd8 other) const { return vorrq_s8(value, other.value); } - simdutf_really_inline simd8 operator&(const simd8 other) const { return vandq_s8(value, other.value); } - simdutf_really_inline simd8 operator^(const simd8 other) const { return veorq_s8(value, other.value); } - simdutf_really_inline simd8 bit_andnot(const simd8 other) const { return vbicq_s8(value, other.value); } - - // Math - simdutf_really_inline simd8 operator+(const simd8 other) const { return vaddq_s8(value, other.value); } - simdutf_really_inline simd8 operator-(const simd8 other) const { return vsubq_s8(value, other.value); } - simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } - simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } - - simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); } - simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); } - simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; } - - // Order-sensitive comparisons - simdutf_really_inline simd8 max_val(const simd8 other) const { return vmaxq_s8(value, other.value); } - simdutf_really_inline simd8 min_val(const simd8 other) const { return vminq_s8(value, other.value); } - simdutf_really_inline simd8 operator>(const simd8 other) const { return vcgtq_s8(value, other.value); } - simdutf_really_inline simd8 operator<(const simd8 other) const { return vcltq_s8(value, other.value); } - simdutf_really_inline simd8 operator==(const simd8 other) const { return vceqq_s8(value, other.value); } - - template - simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { - return vextq_s8(prev_chunk, *this, 16 - N); - } - - // Perform a lookup assuming no value is larger than 16 - template - simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { - return lookup_table.apply_lookup_16_to(*this); - } - template - simdutf_really_inline simd8 lookup_16( - L replace0, L replace1, L replace2, L replace3, - L replace4, L replace5, L replace6, L replace7, - L replace8, L replace9, L replace10, L replace11, - L replace12, L replace13, L replace14, L replace15) const { - return lookup_16(simd8::repeat_16( - replace0, replace1, replace2, replace3, - replace4, replace5, replace6, replace7, - replace8, replace9, replace10, replace11, - replace12, replace13, replace14, replace15 - )); - } - - template - simdutf_really_inline simd8 apply_lookup_16_to(const simd8 original) const { - return vqtbl1q_s8(*this, simd8(original)); - } - }; - template - struct simd8x64 { - static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); - static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block."); - simd8 chunks[NUM_CHUNKS]; + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t 
v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, + uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, + uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } - simd8x64(const simd8x64& o) = delete; // no copy allowed - simd8x64& operator=(const simd8 other) = delete; // no assignment allowed - simd8x64() = delete; // no default constructor allowed + // Store to array + simdutf_really_inline void store(uint8_t dst[16]) const { + return vst1q_u8(dst, *this); + } - simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} - simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T)), simd8::load(ptr+2*sizeof(simd8)/sizeof(T)), simd8::load(ptr+3*sizeof(simd8)/sizeof(T))} {} + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 + operator-(const simd8 other) const { + return vsubq_u8(*this, other); + } + simdutf_really_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *this; + } - simdutf_really_inline void store(T* ptr) const { - this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); - this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); - this->chunks[2].store(ptr+sizeof(simd8)*2/sizeof(T)); - this->chunks[3].store(ptr+sizeof(simd8)*3/sizeof(T)); - } + // Order-specific operations + simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); } + simdutf_really_inline simd8 + operator>=(const simd8 other) const { + return vcgeq_u8(*this, other); + } + simdutf_really_inline simd8 + operator>(const simd8 other) const { + return vcgtq_u8(*this, other); + } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true + // = nonzero. For ARM, returns all 1's. 
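// Illustrative sketch, not part of the vendored file: the lookup_16 idiom in
// its simplest form. One TBL instruction maps every byte, masked here to its
// low nibble so all indices stay in 0..15, through a 16-entry table register.
#include <arm_neon.h>

inline uint8x16_t classify_low_nibbles(uint8x16_t input, uint8x16_t table16) {
  const uint8x16_t idx = vandq_u8(input, vdupq_n_u8(0x0F));
  return vqtbl1q_u8(table16, idx); // per-byte table16[idx]
}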
+ simdutf_really_inline simd8 + gt_bits(const simd8 other) const { + return simd8(*this > other); + } + // Bit-specific operations + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { + return vtstq_u8(*this, bits); + } - simdutf_really_inline simd8x64& operator |=(const simd8x64 &other) { - this->chunks[0] |= other.chunks[0]; - this->chunks[1] |= other.chunks[1]; - this->chunks[2] |= other.chunks[2]; - this->chunks[3] |= other.chunks[3]; - return *this; - } + simdutf_really_inline bool is_ascii() const { + return this->max_val() < 0b10000000u; + } - simdutf_really_inline simd8 reduce_or() const { - return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); - } + simdutf_really_inline bool any_bits_set_anywhere() const { + return this->max_val() != 0; + } + template simdutf_really_inline simd8 shr() const { + return vshrq_n_u8(*this, N); + } + simdutf_really_inline uint16_t sum_bytes() const { return vaddvq_u8(*this); } - simdutf_really_inline bool is_ascii() const { - return reduce_or().is_ascii(); - } + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } - template - simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { - this->chunks[0].template store_ascii_as_utf16(ptr+sizeof(simd8)*0); - this->chunks[1].template store_ascii_as_utf16(ptr+sizeof(simd8)*1); - this->chunks[2].template store_ascii_as_utf16(ptr+sizeof(simd8)*2); - this->chunks[3].template store_ascii_as_utf16(ptr+sizeof(simd8)*3); - } + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } - simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const { - this->chunks[0].store_ascii_as_utf32_tbl(ptr+sizeof(simd8)*0); - this->chunks[1].store_ascii_as_utf32_tbl(ptr+sizeof(simd8)*1); - this->chunks[2].store_ascii_as_utf32_tbl(ptr+sizeof(simd8)*2); - this->chunks[3].store_ascii_as_utf32_tbl(ptr+sizeof(simd8)*3); - } + template + simdutf_really_inline simd8 + apply_lookup_16_to(const simd8 original) const { + return vqtbl1q_u8(*this, simd8(original)); + } +}; - simdutf_really_inline uint64_t to_bitmask() const { -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t bit_mask = simdutf_make_uint8x16_t( - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - ); -#else - const uint8x16_t bit_mask = { - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - }; -#endif - // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. 
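// Illustrative sketch, not part of the vendored file: how the 64-bit masks
// produced by eq()/lt()/gt() are typically consumed, pairing count-trailing-
// zeros with the x & (x - 1) trick from clear_least_significant_bit to visit
// each set lane once. Uses the GCC/Clang builtin; MSVC would call
// _BitScanForward64 as in the vendored trailing_zeroes().
#include <cstdint>

template <typename Callback>
inline void for_each_set_bit(uint64_t mask, Callback cb) {
  while (mask != 0) {
    cb(__builtin_ctzll(mask)); // index of the lowest set bit
    mask &= (mask - 1);        // clear that bit and continue
  }
}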
- uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask)); - uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask)); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); - return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); - } +// Signed bytes +template <> struct simd8 { + int8x16_t value; + static const int SIZE = sizeof(value); - simdutf_really_inline uint64_t eq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] == mask, - this->chunks[1] == mask, - this->chunks[2] == mask, - this->chunks[3] == mask - ).to_bitmask(); + static simdutf_really_inline simd8 splat(int8_t _value) { + return vmovq_n_s8(_value); + } + static simdutf_really_inline simd8 zero() { return vdupq_n_s8(0); } + static simdutf_really_inline simd8 load(const int8_t values[16]) { + return vld1q_s8(values); } - simdutf_really_inline uint64_t lteq(const T m) const { + // Use ST2 instead of UXTL+UXTL2 to interleave zeroes. UXTL is actually a + // USHLL #0, and shifting in NEON is actually quite slow. + // + // While this needs the registers to be in a specific order, bigger cores can + // interleave these with no overhead, and it still performs decently on little + // cores. + // movi v1.3d, #0 + // mov v0.16b, value[0] + // st2 {v0.16b, v1.16b}, [ptr], #32 + // mov v0.16b, value[1] + // st2 {v0.16b, v1.16b}, [ptr], #32 + // ... + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const { + simdutf_constexpr auto matches = match_system(big_endian); + const int8x16x2_t pair = matches + ? int8x16x2_t{{this->value, vmovq_n_s8(0)}} + : int8x16x2_t{{vmovq_n_s8(0), this->value}}; + vst2q_s8(reinterpret_cast(p), pair); + } + + // In places where the table can be reused, which is most uses in simdutf, it + // is worth it to do 4 table lookups, as there is no direct zero extension + // from u8 to u32. 
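// For contrast, an illustrative sketch (not part of the vendored file) of
// the direct UXTL/vmovl zero-extension chain from u8 to u32 that the comment
// above says the TBL version is chosen over: correct, but every step is a
// widening shift.
#include <arm_neon.h>
#include <cstdint>

inline void widen_u8_to_u32(const uint8_t *src, uint32_t *dst) {
  const uint8x16_t v = vld1q_u8(src);
  const uint16x8_t lo = vmovl_u8(vget_low_u8(v));  // bytes 0..7  -> u16
  const uint16x8_t hi = vmovl_u8(vget_high_u8(v)); // bytes 8..15 -> u16
  vst1q_u32(dst + 0, vmovl_u16(vget_low_u16(lo))); // u16 -> u32
  vst1q_u32(dst + 4, vmovl_u16(vget_high_u16(lo)));
  vst1q_u32(dst + 8, vmovl_u16(vget_low_u16(hi)));
  vst1q_u32(dst + 12, vmovl_u16(vget_high_u16(hi)));
}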
+ simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t *p) const { + const simd8 tb1{0, 255, 255, 255, 1, 255, 255, 255, + 2, 255, 255, 255, 3, 255, 255, 255}; + const simd8 tb2{4, 255, 255, 255, 5, 255, 255, 255, + 6, 255, 255, 255, 7, 255, 255, 255}; + const simd8 tb3{8, 255, 255, 255, 9, 255, 255, 255, + 10, 255, 255, 255, 11, 255, 255, 255}; + const simd8 tb4{12, 255, 255, 255, 13, 255, 255, 255, + 14, 255, 255, 255, 15, 255, 255, 255}; + + // encourage store pairing and interleaving + const auto shuf1 = this->apply_lookup_16_to(tb1); + const auto shuf2 = this->apply_lookup_16_to(tb2); + shuf1.store(reinterpret_cast(p)); + shuf2.store(reinterpret_cast(p + 4)); + + const auto shuf3 = this->apply_lookup_16_to(tb3); + const auto shuf4 = this->apply_lookup_16_to(tb4); + shuf3.store(reinterpret_cast(p + 8)); + shuf4.store(reinterpret_cast(p + 12)); + } + // Conversion from/to SIMD register + simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {} + simdutf_really_inline operator const int8x16_t &() const { + return this->value; + } +#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline operator const uint8x16_t() const { + return vreinterpretq_u8_s8(this->value); + } +#endif + simdutf_really_inline operator int8x16_t &() { return this->value; } + + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {} + // Member-by-member initialization +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : simd8(simdutf_make_int8x16_t(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15)) {} +#else + simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : simd8(int8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} +#endif + + // Store to array + simdutf_really_inline void store(int8_t dst[16]) const { + return vst1q_s8(dst, value); + } + // Explicit conversion to/from unsigned + // + // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same + // type. In theory, we could check this occurrence with std::same_as and + // std::enabled_if but it is C++14 and relatively ugly and hard to read. 
+#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline explicit simd8(const uint8x16_t other) + : simd8(vreinterpretq_s8_u8(other)) {} +#endif + simdutf_really_inline operator simd8() const { + return vreinterpretq_u8_s8(this->value); + } + + simdutf_really_inline simd8 + operator|(const simd8 other) const { + return vorrq_s8(value, other.value); + } + + simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); } + simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); } + simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; } + + // Order-sensitive comparisons + simdutf_really_inline simd8 operator>(const simd8 other) const { + return vcgtq_s8(value, other.value); + } + simdutf_really_inline simd8 operator<(const simd8 other) const { + return vcltq_s8(value, other.value); + } + + template + simdutf_really_inline simd8 + apply_lookup_16_to(const simd8 original) const { + return vqtbl1q_s8(*this, simd8(original)); + } +}; + +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, + "ARM kernel should use four registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, + const simd8 chunk2, const simd8 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T *ptr) + : chunks{simd8::load(ptr), + simd8::load(ptr + sizeof(simd8) / sizeof(T)), + simd8::load(ptr + 2 * sizeof(simd8) / sizeof(T)), + simd8::load(ptr + 3 * sizeof(simd8) / sizeof(T))} {} + + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd8) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd8) * 1 / sizeof(T)); + this->chunks[2].store(ptr + sizeof(simd8) * 2 / sizeof(T)); + this->chunks[3].store(ptr + sizeof(simd8) * 3 / sizeof(T)); + } + + simdutf_really_inline simd8x64 &operator|=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + this->chunks[2] |= other.chunks[2]; + this->chunks[3] |= other.chunks[3]; + return *this; + } + + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); } + + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 0); + this->chunks[1].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 1); + this->chunks[2].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 2); + this->chunks[3].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 3); + } + + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + this->chunks[0].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 0); + this->chunks[1].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 1); + this->chunks[2].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 2); + this->chunks[3].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 3); + } + + simdutf_really_inline uint64_t to_bitmask() const { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = + simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 
0x40, 0x80); +#else + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + // Add each of the elements next to each other, successively, to stuff each + // 8 byte mask into one. + uint8x16_t sum0 = + vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), + vandq_u8(uint8x16_t(this->chunks[1]), bit_mask)); + uint8x16_t sum1 = + vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), + vandq_u8(uint8x16_t(this->chunks[3]), bit_mask)); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] < mask, this->chunks[1] < mask, + this->chunks[2] < mask, this->chunks[3] < mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t gt(const T m) const { const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] <= mask, - this->chunks[1] <= mask, - this->chunks[2] <= mask, - this->chunks[3] <= mask - ).to_bitmask(); - } - - simdutf_really_inline uint64_t in_range(const T low, const T high) const { - const simd8 mask_low = simd8::splat(low); - const simd8 mask_high = simd8::splat(high); - - return simd8x64( - (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), - (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), - (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), - (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { - const simd8 mask_low = simd8::splat(low); - const simd8 mask_high = simd8::splat(high); - return simd8x64( - (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), - (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), - (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), - (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t lt(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] < mask, - this->chunks[1] < mask, - this->chunks[2] < mask, - this->chunks[3] < mask - ).to_bitmask(); - } - simdutf_really_inline uint64_t gt(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] > mask, - this->chunks[1] > mask, - this->chunks[2] > mask, - this->chunks[3] > mask - ).to_bitmask(); - } - simdutf_really_inline uint64_t gteq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] >= mask, - this->chunks[1] >= mask, - this->chunks[2] >= mask, - this->chunks[3] >= mask - ).to_bitmask(); - } - simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - simd8(uint8x16_t(this->chunks[0])) >= mask, - simd8(uint8x16_t(this->chunks[1])) >= mask, - simd8(uint8x16_t(this->chunks[2])) >= mask, - simd8(uint8x16_t(this->chunks[3])) >= mask - ).to_bitmask(); - } - }; // struct simd8x64 + return simd8x64(this->chunks[0] > mask, this->chunks[1] > mask, + this->chunks[2] > mask, this->chunks[3] > mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(simd8(uint8x16_t(this->chunks[0])) >= mask, + simd8(uint8x16_t(this->chunks[1])) >= mask, + simd8(uint8x16_t(this->chunks[2])) >= mask, + 
simd8(uint8x16_t(this->chunks[3])) >= mask) + .to_bitmask(); + } +}; // struct simd8x64 /* begin file src/simdutf/arm64/simd16-inl.h */ -template -struct simd16; - - template> - struct base_u16 { - uint16x8_t value; - static const int SIZE = sizeof(value); - - // Conversion from/to SIMD register - simdutf_really_inline base_u16() = default; - simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {} - simdutf_really_inline operator const uint16x8_t&() const { return this->value; } - simdutf_really_inline operator uint16x8_t&() { return this->value; } - // Bit operations - simdutf_really_inline simd16 operator|(const simd16 other) const { return vorrq_u16(*this, other); } - simdutf_really_inline simd16 operator&(const simd16 other) const { return vandq_u16(*this, other); } - simdutf_really_inline simd16 operator^(const simd16 other) const { return veorq_u16(*this, other); } - simdutf_really_inline simd16 bit_andnot(const simd16 other) const { return vbicq_u16(*this, other); } - simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } - simdutf_really_inline simd16& operator|=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast | other; return *this_cast; } - simdutf_really_inline simd16& operator&=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast & other; return *this_cast; } - simdutf_really_inline simd16& operator^=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast ^ other; return *this_cast; } - - friend simdutf_really_inline Mask operator==(const simd16 lhs, const simd16 rhs) { return vceqq_u16(lhs, rhs); } - - template - simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { - return vextq_u18(prev_chunk, *this, 8 - N); - } - }; +template struct simd16; + +template > struct base_u16 { + uint16x8_t value; + /// the size of vector in bytes + static const int SIZE = sizeof(value); + /// the number of elements of type T a vector can hold + static const int ELEMENTS = SIZE / sizeof(T); + // Conversion from/to SIMD register + simdutf_really_inline base_u16() = default; + simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {} + simdutf_really_inline operator const uint16x8_t &() const { + return this->value; + } + simdutf_really_inline operator uint16x8_t &() { return this->value; } + // Bit operations + simdutf_really_inline simd16 operator|(const simd16 other) const { + return vorrq_u16(*this, other); + } + simdutf_really_inline simd16 operator&(const simd16 other) const { + return vandq_u16(*this, other); + } + simdutf_really_inline simd16 operator^(const simd16 other) const { + return veorq_u16(*this, other); + } + simdutf_really_inline simd16 bit_andnot(const simd16 other) const { + return vbicq_u16(*this, other); + } + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd16 &operator|=(const simd16 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast | other; + return *this_cast; + } + simdutf_really_inline simd16 &operator&=(const simd16 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast & other; + return *this_cast; + } + simdutf_really_inline simd16 &operator^=(const simd16 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast ^ other; + return *this_cast; + } + + friend simdutf_really_inline Mask operator==(const simd16 lhs, + const simd16 rhs) { + return vceqq_u16(lhs, rhs); + } + + template + 
simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return vextq_u18(prev_chunk, *this, 8 - N); + } +}; -template> -struct base16: base_u16 { +template > +struct base16 : base_u16 { typedef uint16_t bitmask_t; typedef uint32_t bitmask2_t; simdutf_really_inline base16() : base_u16() {} simdutf_really_inline base16(const uint16x8_t _value) : base_u16(_value) {} template - simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {} + simdutf_really_inline base16(const Pointer *ptr) : base16(vld1q_u16(ptr)) {} static const int SIZE = sizeof(base_u16::value); - - template + void dump() const { +#ifdef SIMDUTF_LOGGING + uint16_t temp[8]; + vst1q_u16(temp, *this); + printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]\n", temp[0], + temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]); +#endif // SIMDUTF_LOGGING + } + template simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { return vextq_u18(prev_chunk, *this, 8 - N); } }; // SIMD byte mask type (returned by things like eq and gt) -template<> -struct simd16: base16 { - static simdutf_really_inline simd16 splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); } +template <> struct simd16 : base16 { + static simdutf_really_inline simd16 splat(bool _value) { + return vmovq_n_u16(uint16_t(-(!!_value))); + } - simdutf_really_inline simd16() : base16() {} - simdutf_really_inline simd16(const uint16x8_t _value) : base16(_value) {} + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const uint16x8_t _value) + : base16(_value) {} // Splat constructor - simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} - + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} }; -template -struct base16_numeric: base16 { - static simdutf_really_inline simd16 splat(T _value) { return vmovq_n_u16(_value); } +template struct base16_numeric : base16 { + static simdutf_really_inline simd16 splat(T _value) { + return vmovq_n_u16(_value); + } static simdutf_really_inline simd16 zero() { return vdupq_n_u16(0); } static simdutf_really_inline simd16 load(const T values[8]) { - return vld1q_u16(reinterpret_cast(values)); + return vld1q_u16(reinterpret_cast(values)); } simdutf_really_inline base16_numeric() : base16() {} - simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16(_value) {} + simdutf_really_inline base16_numeric(const uint16x8_t _value) + : base16(_value) {} // Store to array - simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); } + simdutf_really_inline void store(T dst[8]) const { + return vst1q_u16(dst, *this); + } // Override to distinguish from bool version simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } // Addition/subtraction are the same for signed and unsigned - simdutf_really_inline simd16 operator+(const simd16 other) const { return vaddq_u8(*this, other); } - simdutf_really_inline simd16 operator-(const simd16 other) const { return vsubq_u8(*this, other); } - simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return *static_cast*>(this); } - simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } + simdutf_really_inline simd16 operator+(const simd16 other) const { + return vaddq_u16(*this, other); + } + simdutf_really_inline simd16 operator-(const simd16 other) const { + return vsubq_u16(*this, other); + } + simdutf_really_inline simd16 &operator+=(const simd16 
other) { + *this = *this + other; + return *static_cast *>(this); + } + simdutf_really_inline simd16 &operator-=(const simd16 other) { + *this = *this - other; + return *static_cast *>(this); + } }; // Signed code units -template<> -struct simd16 : base16_numeric { +template <> struct simd16 : base16_numeric { simdutf_really_inline simd16() : base16_numeric() {} #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO - simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric(_value) {} + simdutf_really_inline simd16(const uint16x8_t _value) + : base16_numeric(_value) {} #endif - simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric(vreinterpretq_u16_s16(_value)) {} + simdutf_really_inline simd16(const int16x8_t _value) + : base16_numeric(vreinterpretq_u16_s16(_value)) {} // Splat constructor simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} // Array constructor - simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {} - simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast(values))) {} simdutf_really_inline operator simd16() const; - simdutf_really_inline operator const uint16x8_t&() const { return this->value; } - simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); } + simdutf_really_inline operator const uint16x8_t &() const { + return this->value; + } + simdutf_really_inline operator const int16x8_t() const { + return vreinterpretq_s16_u16(this->value); + } - simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); } - simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); } + simdutf_really_inline int16_t max_val() const { + return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); + } + simdutf_really_inline int16_t min_val() const { + return vminvq_s16(vreinterpretq_s16_u16(this->value)); + } // Order-sensitive comparisons - simdutf_really_inline simd16 max_val(const simd16 other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } - simdutf_really_inline simd16 min_val(const simd16 other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } - simdutf_really_inline simd16 operator>(const simd16 other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } - simdutf_really_inline simd16 operator<(const simd16 other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } + simdutf_really_inline simd16 + max_val(const simd16 other) const { + return vmaxq_s16(vreinterpretq_s16_u16(this->value), + vreinterpretq_s16_u16(other.value)); + } + simdutf_really_inline simd16 + min_val(const simd16 other) const { + return vmaxq_s16(vreinterpretq_s16_u16(this->value), + vreinterpretq_s16_u16(other.value)); + } + simdutf_really_inline simd16 + operator>(const simd16 other) const { + return vcgtq_s16(vreinterpretq_s16_u16(this->value), + vreinterpretq_s16_u16(other.value)); + } + simdutf_really_inline simd16 + operator<(const simd16 other) const { + return vcltq_s16(vreinterpretq_s16_u16(this->value), + vreinterpretq_s16_u16(other.value)); + } }; - - - // Unsigned code units -template<> -struct simd16: base16_numeric { 
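// Illustrative sketch, not part of the vendored file: the byte swap that
// simd16<uint16_t>::swap_bytes performs further down. vrev16q_u8 reverses
// the two bytes inside every 16-bit lane, converting UTF-16LE code units to
// UTF-16BE and back.
#include <arm_neon.h>
#include <cstdint>

inline uint16x8_t swap_utf16_endianness(uint16x8_t v) {
  return vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v)));
}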
+template <> struct simd16 : base16_numeric { simdutf_really_inline simd16() : base16_numeric() {} - simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric(_value) {} + simdutf_really_inline simd16(const uint16x8_t _value) + : base16_numeric(_value) {} // Splat constructor simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} // Array constructor - simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {} - simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} - + simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast(values))) {} simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); } simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); } // Saturated math - simdutf_really_inline simd16 saturating_add(const simd16 other) const { return vqaddq_u16(*this, other); } - simdutf_really_inline simd16 saturating_sub(const simd16 other) const { return vqsubq_u16(*this, other); } + simdutf_really_inline simd16 + saturating_add(const simd16 other) const { + return vqaddq_u16(*this, other); + } + simdutf_really_inline simd16 + saturating_sub(const simd16 other) const { + return vqsubq_u16(*this, other); + } // Order-specific operations - simdutf_really_inline simd16 max_val(const simd16 other) const { return vmaxq_u16(*this, other); } - simdutf_really_inline simd16 min_val(const simd16 other) const { return vminq_u16(*this, other); } + simdutf_really_inline simd16 + max_val(const simd16 other) const { + return vmaxq_u16(*this, other); + } + simdutf_really_inline simd16 + min_val(const simd16 other) const { + return vminq_u16(*this, other); + } // Same as >, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd16 gt_bits(const simd16 other) const { return this->saturating_sub(other); } + simdutf_really_inline simd16 + gt_bits(const simd16 other) const { + return this->saturating_sub(other); + } // Same as <, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd16 lt_bits(const simd16 other) const { return other.saturating_sub(*this); } - simdutf_really_inline simd16 operator<=(const simd16 other) const { return vcleq_u16(*this, other); } - simdutf_really_inline simd16 operator>=(const simd16 other) const { return vcgeq_u16(*this, other); } - simdutf_really_inline simd16 operator>(const simd16 other) const { return vcgtq_u16(*this, other); } - simdutf_really_inline simd16 operator<(const simd16 other) const { return vcltq_u16(*this, other); } + simdutf_really_inline simd16 + lt_bits(const simd16 other) const { + return other.saturating_sub(*this); + } + simdutf_really_inline simd16 + operator<=(const simd16 other) const { + return vcleq_u16(*this, other); + } + simdutf_really_inline simd16 + operator>=(const simd16 other) const { + return vcgeq_u16(*this, other); + } + simdutf_really_inline simd16 + operator>(const simd16 other) const { + return vcgtq_u16(*this, other); + } + simdutf_really_inline simd16 + operator<(const simd16 other) const { + return vcltq_u16(*this, other); + } // Bit-specific operations - simdutf_really_inline simd16 bits_not_set() const { return *this == uint16_t(0); } - template - simdutf_really_inline simd16 shr() const { return simd16(vshrq_n_u16(*this, N)); } - template - simdutf_really_inline simd16 shl() const { return simd16(vshlq_n_u16(*this, N)); } - - 
// logical operations - simdutf_really_inline simd16 operator|(const simd16 other) const { return vorrq_u16(*this, other); } - simdutf_really_inline simd16 operator&(const simd16 other) const { return vandq_u16(*this, other); } - simdutf_really_inline simd16 operator^(const simd16 other) const { return veorq_u16(*this, other); } - - // Pack with the unsigned saturation of two uint16_t code units into single uint8_t vector - static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + simdutf_really_inline simd16 bits_not_set() const { + return *this == uint16_t(0); + } + template simdutf_really_inline simd16 shr() const { + return simd16(vshrq_n_u16(*this, N)); + } + template simdutf_really_inline simd16 shl() const { + return simd16(vshlq_n_u16(*this, N)); + } + + // Pack with the unsigned saturation of two uint16_t code units into single + // uint8_t vector + static simdutf_really_inline simd8 pack(const simd16 &v0, + const simd16 &v1) { return vqmovn_high_u16(vqmovn_u16(v0), v1); } @@ -1635,317 +2909,524 @@ struct simd16: base16_numeric { simdutf_really_inline simd16 swap_bytes() const { return vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(*this))); } + + void dump() const { + uint16_t temp[8]; + vst1q_u16(temp, *this); + printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]\n", temp[0], + temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]); + } + + simdutf_really_inline uint32_t sum() const { return vaddlvq_u16(value); } }; -simdutf_really_inline simd16::operator simd16() const { return this->value; } +simdutf_really_inline simd16::operator simd16() const { + return this->value; +} - template - struct simd16x32 { - static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); - static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block."); - simd16 chunks[NUM_CHUNKS]; +template struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 4, + "ARM kernel should use four registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; - simd16x32(const simd16x32& o) = delete; // no copy allowed - simd16x32& operator=(const simd16 other) = delete; // no assignment allowed - simd16x32() = delete; // no default constructor allowed + simd16x32(const simd16x32 &o) = delete; // no copy allowed + simd16x32 & + operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed - simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1, const simd16 chunk2, const simd16 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} - simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T)), simd16::load(ptr+2*sizeof(simd16)/sizeof(T)), simd16::load(ptr+3*sizeof(simd16)/sizeof(T))} {} + simdutf_really_inline + simd16x32(const simd16 chunk0, const simd16 chunk1, + const simd16 chunk2, const simd16 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T *ptr) + : chunks{simd16::load(ptr), + simd16::load(ptr + sizeof(simd16) / sizeof(T)), + simd16::load(ptr + 2 * sizeof(simd16) / sizeof(T)), + simd16::load(ptr + 3 * sizeof(simd16) / sizeof(T))} {} - simdutf_really_inline void store(T* ptr) const { - this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); - this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); - this->chunks[2].store(ptr+sizeof(simd16)*2/sizeof(T)); - this->chunks[3].store(ptr+sizeof(simd16)*3/sizeof(T)); - } + 
simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd16) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd16) * 1 / sizeof(T)); + this->chunks[2].store(ptr + sizeof(simd16) * 2 / sizeof(T)); + this->chunks[3].store(ptr + sizeof(simd16) * 3 / sizeof(T)); + } - simdutf_really_inline simd16 reduce_or() const { - return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); - } + simdutf_really_inline simd16 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); + } - simdutf_really_inline bool is_ascii() const { - return reduce_or().is_ascii(); - } + simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); } - simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { - this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); - this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)*1); - this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16)*2); - this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16)*3); - } + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16) * 0); + this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16) * 1); + this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16) * 2); + this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16) * 3); + } - simdutf_really_inline uint64_t to_bitmask() const { + simdutf_really_inline uint64_t to_bitmask() const { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t bit_mask = simdutf_make_uint8x16_t( - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - ); + const uint8x16_t bit_mask = + simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); #else - const uint8x16_t bit_mask = { - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - }; + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; #endif - // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. - uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask))); - uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask))); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); - return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); - } - - simdutf_really_inline void swap_bytes() { - this->chunks[0] = this->chunks[0].swap_bytes(); - this->chunks[1] = this->chunks[1].swap_bytes(); - this->chunks[2] = this->chunks[2].swap_bytes(); - this->chunks[3] = this->chunks[3].swap_bytes(); - } + // Add each of the elements next to each other, successively, to stuff each + // 8 byte mask into one. 
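// Annotation for the reduction below: after the AND with bit_mask, every
// byte of a true lane carries one distinct bit of {0x01, ..., 0x80}, and the
// pattern repeats every 8 bytes. Adjacent bytes therefore hold disjoint
// bits, so each vpaddq_u8 behaves like an OR that halves the data; three
// rounds fold the 64 masked bytes into 8 bytes, read out as one uint64_t.
// Bit i of the result corresponds to source byte i, so a true 16-bit lane
// sets two adjacent bits of the mask.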
+ uint8x16_t sum0 = vpaddq_u8( + vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), + vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask))); + uint8x16_t sum1 = vpaddq_u8( + vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), + vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask))); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } - simdutf_really_inline uint64_t eq(const T m) const { + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); + this->chunks[2] = this->chunks[2].swap_bytes(); + this->chunks[3] = this->chunks[3].swap_bytes(); + } + simdutf_really_inline uint64_t gt(const T m) const { const simd16 mask = simd16::splat(m); - return simd16x32( - this->chunks[0] == mask, - this->chunks[1] == mask, - this->chunks[2] == mask, - this->chunks[3] == mask - ).to_bitmask(); + return simd16x32(this->chunks[0] > mask, this->chunks[1] > mask, + this->chunks[2] > mask, this->chunks[3] > mask) + .to_bitmask(); } simdutf_really_inline uint64_t lteq(const T m) const { const simd16 mask = simd16::splat(m); - return simd16x32( - this->chunks[0] <= mask, - this->chunks[1] <= mask, - this->chunks[2] <= mask, - this->chunks[3] <= mask - ).to_bitmask(); - } - - simdutf_really_inline uint64_t in_range(const T low, const T high) const { - const simd16 mask_low = simd16::splat(low); - const simd16 mask_high = simd16::splat(high); - - return simd16x32( - (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), - (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), - (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), - (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { - const simd16 mask_low = simd16::splat(low); - const simd16 mask_high = simd16::splat(high); - return simd16x32( - (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), - (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), - (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), - (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t lt(const T m) const { - const simd16 mask = simd16::splat(m); - return simd16x32( - this->chunks[0] < mask, - this->chunks[1] < mask, - this->chunks[2] < mask, - this->chunks[3] < mask - ).to_bitmask(); - } - - }; // struct simd16x32 - template<> - simdutf_really_inline uint64_t simd16x32::not_in_range(const uint16_t low, const uint16_t high) const { - const simd16 mask_low = simd16::splat(low); - const simd16 mask_high = simd16::splat(high); - simd16x32 x( - simd16((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)), - simd16((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)), - simd16((this->chunks[2] > mask_high) | (this->chunks[2] < mask_low)), - simd16((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)) - ); - return x.to_bitmask(); - } + return simd16x32(this->chunks[0] <= mask, this->chunks[1] <= mask, + this->chunks[2] <= mask, this->chunks[3] <= mask) + .to_bitmask(); + } + + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + return simd16x32( + (this->chunks[0] > mask_high) | 
(this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), + (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), + (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)) + .to_bitmask(); + } +}; // struct simd16x32 +template <> +simdutf_really_inline uint64_t simd16x32::not_in_range( + const uint16_t low, const uint16_t high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + simd16x32 x(simd16((this->chunks[0] > mask_high) | + (this->chunks[0] < mask_low)), + simd16((this->chunks[1] > mask_high) | + (this->chunks[1] < mask_low)), + simd16((this->chunks[2] > mask_high) | + (this->chunks[2] < mask_low)), + simd16((this->chunks[3] > mask_high) | + (this->chunks[3] < mask_low))); + return x.to_bitmask(); +} + +simdutf_really_inline simd16 min(const simd16 a, + simd16 b) { + return vminq_u16(a.value, b.value); +} /* end file src/simdutf/arm64/simd16-inl.h */ -} // namespace simd -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf +/* begin file src/simdutf/arm64/simd32-inl.h */ +template struct simd32; -#endif // SIMDUTF_ARM64_SIMD_H -/* end file src/simdutf/arm64/simd.h */ +template <> struct simd32 { + static const size_t SIZE = sizeof(uint32x4_t); + static const size_t ELEMENTS = SIZE / sizeof(uint32_t); -/* begin file src/simdutf/arm64/end.h */ -/* end file src/simdutf/arm64/end.h */ + uint32x4_t value; -#endif // SIMDUTF_IMPLEMENTATION_ARM64 + simdutf_really_inline simd32(const uint32x4_t v) : value(v) {} -#endif // SIMDUTF_ARM64_H -/* end file src/simdutf/arm64.h */ -/* begin file src/simdutf/icelake.h */ -#ifndef SIMDUTF_ICELAKE_H -#define SIMDUTF_ICELAKE_H + template + simdutf_really_inline simd32(const Pointer *ptr) + : value(vld1q_u32(reinterpret_cast(ptr))) {} + simdutf_really_inline uint64_t sum() const { return vaddvq_u32(value); } + simdutf_really_inline simd32 swap_bytes() const { + return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(value))); + } -#ifdef __has_include -// How do we detect that a compiler supports vbmi2? -// For sure if the following header is found, we are ok? -#if __has_include() -#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1 -#endif -#endif + template simdutf_really_inline simd32 shr() const { + return vshrq_n_u32(value, N); + } -#ifdef _MSC_VER -#if _MSC_VER >= 1930 -// Visual Studio 2022 and up support VBMI2 under x64 even if the header -// avx512vbmi2intrin.h is not found. -// Visual Studio 2019 technically supports VBMI2, but the implementation -// might be unreliable. Search for visualstudio2019icelakeissue in our -// tests. -#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1 -#endif -#endif + template simdutf_really_inline simd32 shl() const { + return vshlq_n_u32(value, N); + } -// We allow icelake on x64 as long as the compiler is known to support VBMI2. 
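// Annotation: two different questions are settled by the macros around this
// comment. SIMDUTF_IMPLEMENTATION_ICELAKE asks whether the compiler can
// *emit* the icelake kernel at all (so it can be compiled in and selected at
// runtime), while SIMDUTF_CAN_ALWAYS_RUN_ICELAKE, defined just below, asks
// whether the whole build already targets these AVX-512 extensions, in which
// case the kernel may be called unconditionally with no runtime dispatch.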
-#ifndef SIMDUTF_IMPLEMENTATION_ICELAKE -#define SIMDUTF_IMPLEMENTATION_ICELAKE ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2)) -#endif + void dump() const { +#ifdef SIMDUTF_LOGGING + uint32_t temp[4]; + vst1q_u32(temp, value); + printf("[%08x, %08x, %08x, %08x]\n", temp[0], temp[1], temp[2], temp[3]); +#endif // SIMDUTF_LOGGING + } -// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see -// https://github.com/simdutf/simdutf/issues/1247 -#if ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && (SIMDUTF_HAS_AVX512F && \ - SIMDUTF_HAS_AVX512DQ && \ - SIMDUTF_HAS_AVX512VL && \ - SIMDUTF_HAS_AVX512VBMI2) && (!SIMDUTF_IS_32BITS)) -#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE 1 -#else -#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE 0 -#endif + // operators + simdutf_really_inline simd32 &operator+=(const simd32 other) { + value = vaddq_u32(value, other.value); + return *this; + } -#if SIMDUTF_IMPLEMENTATION_ICELAKE -#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE -#define SIMDUTF_TARGET_ICELAKE -#else -#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512vpopcntdq") -#endif + // static members + simdutf_really_inline static simd32 zero() { + return vdupq_n_u32(0); + } -namespace simdutf { -namespace icelake { -} // namespace icelake -} // namespace simdutf + simdutf_really_inline static simd32 splat(uint32_t v) { + return vdupq_n_u32(v); + } +}; +//---------------------------------------------------------------------- +template <> struct simd32 { + uint32x4_t value; -// -// These two need to be included outside SIMDUTF_TARGET_REGION -// -/* begin file src/simdutf/icelake/intrinsics.h */ -#ifndef SIMDUTF_ICELAKE_INTRINSICS_H -#define SIMDUTF_ICELAKE_INTRINSICS_H + simdutf_really_inline simd32(const uint32x4_t v) : value(v) {} + simdutf_really_inline bool any() const { return vmaxvq_u32(value) != 0; } +}; -#ifdef SIMDUTF_VISUAL_STUDIO -// under clang within visual studio, this will include -#include // visual studio or clang -#include -#else +//---------------------------------------------------------------------- -#if SIMDUTF_GCC11ORMORE -// We should not get warnings while including yet we do -// under some versions of GCC. -// If the x86intrin.h header has uninitialized values that are problematic, -// it is a GCC issue, we want to ignore these warnings. 
-SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized) +template +simdutf_really_inline simd32 operator|(const simd32 a, + const simd32 b) { + return vorrq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 min(const simd32 a, + const simd32 b) { + return vminq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 max(const simd32 a, + const simd32 b) { + return vmaxq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator==(const simd32 a, + uint32_t b) { + return vceqq_u32(a.value, vdupq_n_u32(b)); +} + +simdutf_really_inline simd32 operator&(const simd32 a, + const simd32 b) { + return vandq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator&(const simd32 a, + uint32_t b) { + return vandq_u32(a.value, vdupq_n_u32(b)); +} + +simdutf_really_inline simd32 operator|(const simd32 a, + uint32_t b) { + return vorrq_u32(a.value, vdupq_n_u32(b)); +} + +simdutf_really_inline simd32 operator+(const simd32 a, + const simd32 b) { + return vaddq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator-(const simd32 a, + uint32_t b) { + return vsubq_u32(a.value, vdupq_n_u32(b)); +} + +simdutf_really_inline simd32 operator>=(const simd32 a, + const simd32 b) { + return vcgeq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator!(const simd32 v) { + return vmvnq_u32(v.value); +} + +simdutf_really_inline simd32 operator>(const simd32 a, + const simd32 b) { + return vcgtq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 select(const simd32 cond, + const simd32 v_true, + const simd32 v_false) { + return vbslq_u32(cond.value, v_true.value, v_false.value); +} +/* end file src/simdutf/arm64/simd32-inl.h */ +/* begin file src/simdutf/arm64/simd64-inl.h */ +template struct simd64; + +template <> struct simd64 { + uint64x2_t value; + + simdutf_really_inline simd64(const uint64x2_t v) : value(v) {} + + template + simdutf_really_inline simd64(const Pointer *ptr) + : value(vld1q_u64(reinterpret_cast(ptr))) {} + + simdutf_really_inline uint64_t sum() const { return vaddvq_u64(value); } + + // operators + simdutf_really_inline simd64 &operator+=(const simd64 other) { + value = vaddq_u64(value, other.value); + return *this; + } + + // static members + simdutf_really_inline static simd64 zero() { + return vdupq_n_u64(0); + } + + simdutf_really_inline static simd64 splat(uint64_t v) { + return vdupq_n_u64(v); + } +}; +/* end file src/simdutf/arm64/simd64-inl.h */ + +simdutf_really_inline simd64 sum_8bytes(const simd8 v) { + // We do it as 3 instructions. There might be a faster way. + // We hope that these 3 instructions are cheap. + uint16x8_t first_sum = vpaddlq_u8(v); + uint32x4_t second_sum = vpaddlq_u16(first_sum); + return vpaddlq_u32(second_sum); +} + +} // namespace simd +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf + +#endif // SIMDUTF_ARM64_SIMD_H +/* end file src/simdutf/arm64/simd.h */ + +/* begin file src/simdutf/arm64/end.h */ +#undef SIMDUTF_SIMD_HAS_BYTEMASK +/* end file src/simdutf/arm64/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_ARM64 + +#endif // SIMDUTF_ARM64_H +/* end file src/simdutf/arm64.h */ +/* begin file src/simdutf/icelake.h */ +#ifndef SIMDUTF_ICELAKE_H +#define SIMDUTF_ICELAKE_H + + +#ifdef __has_include + // How do we detect that a compiler supports vbmi2? + // For sure if the following header is found, we are ok? 
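// The probe on the next lines answers that question with a header-existence
// test. A standalone sketch of the same pattern (SKETCH_HAS_VBMI2 is an
// illustrative name; the header is the one the surrounding comments name):
#if defined(__has_include)
  #if __has_include(<avx512vbmi2intrin.h>)
    #define SKETCH_HAS_VBMI2 1 // compiler ships the VBMI2 intrinsics header
  #endif
#endif
#ifndef SKETCH_HAS_VBMI2
  #define SKETCH_HAS_VBMI2 0
#endif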
+ #if __has_include() + #define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1 + #endif #endif -#include // elsewhere +#ifdef _MSC_VER + #if _MSC_VER >= 1930 + // Visual Studio 2022 and up support VBMI2 under x64 even if the header + // avx512vbmi2intrin.h is not found. + // Visual Studio 2019 technically supports VBMI2, but the implementation + // might be unreliable. Search for visualstudio2019icelakeissue in our + // tests. + #ifndef SIMDUTF_COMPILER_SUPPORTS_VBMI2 + #define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1 + #endif + #endif +#endif +#if SIMDUTF_GCC9OROLDER && SIMDUTF_IS_X86_64 + #define SIMDUTF_IMPLEMENTATION_ICELAKE 0 + #warning \ + "You are using a legacy GCC compiler, we are disabling AVX-512 support" +#endif -#if SIMDUTF_GCC11ORMORE -// cancels the suppression of the -Wuninitialized -SIMDUTF_POP_DISABLE_WARNINGS +// We allow icelake on x64 as long as the compiler is known to support VBMI2. +#ifndef SIMDUTF_IMPLEMENTATION_ICELAKE + #define SIMDUTF_IMPLEMENTATION_ICELAKE \ + ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2)) #endif -#ifndef _tzcnt_u64 -#define _tzcnt_u64(x) __tzcnt_u64(x) -#endif // _tzcnt_u64 -#endif // SIMDUTF_VISUAL_STUDIO +// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see +// https://github.com/simdutf/simdutf/issues/1247 +#if ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && \ + (SIMDUTF_HAS_AVX512F && SIMDUTF_HAS_AVX512DQ && SIMDUTF_HAS_AVX512VL && \ + SIMDUTF_HAS_AVX512VBMI2) && \ + (!SIMDUTF_IS_32BITS)) + #define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE 1 +#else + #define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE 0 +#endif -#ifdef SIMDUTF_CLANG_VISUAL_STUDIO -/** - * You are not supposed, normally, to include these - * headers directly. Instead you should either include intrin.h - * or x86intrin.h. However, when compiling with clang - * under Windows (i.e., when _MSC_VER is set), these headers - * only get included *if* the corresponding features are detected - * from macros: - * e.g., if __AVX2__ is set... in turn, we normally set these - * macros by compiling against the corresponding architecture - * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole - * software with these advanced instructions. In simdutf, we - * want to compile the whole program for a generic target, - * and only target our specific kernels. As a workaround, - * we directly include the needed headers. These headers would - * normally guard against such usage, but we carefully included - * (or ) before, so the headers - * are fooled. - */ -#include // for _blsr_u64 -#include // for _pext_u64, _pdep_u64 -#include // for __lzcnt64 -#include // for most things (AVX2, AVX512, _popcnt64) -#include -#include -#include -#include -// Important: we need the AVX-512 headers: -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -// unfortunately, we may not get _blsr_u64, but, thankfully, clang -// has it as a macro. 
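// Annotation: _blsr_u64 is BMI1's "reset lowest set bit". The fallback macro
// defined below relies on the identity n & (n - 1), which clears exactly the
// lowest set bit, e.g. 0b1011000 & 0b1010111 == 0b1010000:
static_assert((0x58ull & (0x58ull - 1)) == 0x50ull,
              "n & (n - 1) clears the lowest set bit");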
-#ifndef _blsr_u64 -// we roll our own -#define _blsr_u64(n) ((n - 1) & n) -#endif // _blsr_u64 -#endif // SIMDUTF_CLANG_VISUAL_STUDIO +#if SIMDUTF_IMPLEMENTATION_ICELAKE + #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE + #define SIMDUTF_TARGET_ICELAKE + #else + #define SIMDUTF_TARGET_ICELAKE \ + SIMDUTF_TARGET_REGION( \ + "avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2," \ + "avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512vpopcntdq") + #endif + +namespace simdutf { +namespace icelake {} // namespace icelake +} // namespace simdutf + + // + // These two need to be included outside SIMDUTF_TARGET_REGION + // +/* begin file src/simdutf/icelake/intrinsics.h */ +#ifndef SIMDUTF_ICELAKE_INTRINSICS_H +#define SIMDUTF_ICELAKE_INTRINSICS_H + + +#ifdef SIMDUTF_VISUAL_STUDIO + // under clang within visual studio, this will include + #include // visual studio or clang + #include +#else + #if SIMDUTF_GCC11ORMORE +// We should not get warnings while including yet we do +// under some versions of GCC. +// If the x86intrin.h header has uninitialized values that are problematic, +// it is a GCC issue, we want to ignore these warnings. +SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized) + #endif + + #include // elsewhere + + #if SIMDUTF_GCC11ORMORE +// cancels the suppression of the -Wuninitialized +SIMDUTF_POP_DISABLE_WARNINGS + #endif + + #ifndef _tzcnt_u64 + #define _tzcnt_u64(x) __tzcnt_u64(x) + #endif // _tzcnt_u64 +#endif // SIMDUTF_VISUAL_STUDIO +#ifdef SIMDUTF_CLANG_VISUAL_STUDIO + /** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdutf, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. + */ + #include // for _blsr_u64 + #include // for _pext_u64, _pdep_u64 + #include // for __lzcnt64 + #include // for most things (AVX2, AVX512, _popcnt64) + #include + #include + #include + #include + // Important: we need the AVX-512 headers: + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + // unfortunately, we may not get _blsr_u64, but, thankfully, clang + // has it as a macro. + #ifndef _blsr_u64 + // we roll our own + #define _blsr_u64(n) ((n - 1) & n) + #endif // _blsr_u64 +#endif // SIMDUTF_CLANG_VISUAL_STUDIO #if defined(__GNUC__) && !defined(__clang__) -#if __GNUC__ == 8 -#define SIMDUTF_GCC8 1 -#elif __GNUC__ == 9 -#define SIMDUTF_GCC9 1 -#endif // __GNUC__ == 8 || __GNUC__ == 9 + #if __GNUC__ == 8 + #define SIMDUTF_GCC8 1 + #elif __GNUC__ == 9 + #define SIMDUTF_GCC9 1 + #endif // __GNUC__ == 8 || __GNUC__ == 9 #endif // defined(__GNUC__) && !defined(__clang__) #if SIMDUTF_GCC8 -#pragma GCC push_options -#pragma GCC target("avx512f") + #pragma GCC push_options + #pragma GCC target("avx512f") /** * GCC 8 fails to provide _mm512_set_epi8. We roll our own. 
*/ -inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) { - return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56), - uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56), - uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56), - uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56), - uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56), - uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56), - uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56), - uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56)); -} -#pragma GCC pop_options +inline __m512i +_mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, + uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, + uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, + uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, + uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, + uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, + uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, + uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, + uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, + uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, + uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, + uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, + uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) { + return _mm512_set_epi64( + uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + + (uint64_t(a1) << 48) + (uint64_t(a0) << 56), + uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + + (uint64_t(a12) << 24) + 
(uint64_t(a11) << 32) + + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56), + uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56), + uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56), + uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56), + uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56), + uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56), + uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + + (uint64_t(a56) << 56)); +} + #pragma GCC pop_options #endif // SIMDUTF_GCC8 #endif // SIMDUTF_HASWELL_INTRINSICS_H @@ -1964,91 +3445,319 @@ using namespace simdutf; class implementation final : public simdutf::implementation { public: - simdutf_really_inline implementation() : simdutf::implementation( - "icelake", - "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)", - internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 | internal::instruction_set::AVX512VPOPCNTDQ ) {} - simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; - simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - 
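// Usage sketch: each member declared in this implementation class backs a
// dispatching free function of the same name in the simdutf namespace, and
// callers size the output with the matching *_length_from_* query first.
// Illustrative caller (function and buffer names are made up; the simdutf::
// calls are the library's public API):
//   std::vector<char16_t> to_utf16le(const char *utf8, size_t len) {
//     if (!simdutf::validate_utf8(utf8, len)) return {};
//     std::vector<char16_t> out(simdutf::utf16_length_from_utf8(utf8, len));
//     size_t written = simdutf::convert_utf8_to_utf16le(utf8, len, out.data());
//     out.resize(written); // convert_* returns the number of code units written
//     return out;
//   }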
simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t 
convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; - simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept; - 
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options) const noexcept; - size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept; + simdutf_really_inline implementation() + : simdutf::implementation( + "icelake", + "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 " + "extensions)", + internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | + internal::instruction_set::BMI2 | + internal::instruction_set::AVX512BW | + internal::instruction_set::AVX512CD | + internal::instruction_set::AVX512VL | + internal::instruction_set::AVX512VBMI2 | + internal::instruction_set::AVX512VPOPCNTDQ) {} + +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool validate_ascii(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept final; + + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; + void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept final; + void 
to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + 
simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + 
convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *buf, size_t length, + char16_t *output) const noexcept final; + simdutf_warn_unused size_t + count_utf16le(const char16_t *buf, size_t length) const noexcept override; + simdutf_warn_unused size_t + count_utf16be(const char16_t *buf, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept override; + simdutf_warn_unused size_t utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept override; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf16_length_from_utf8( + const char *input, size_t length) const noexcept override; + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept override; + ; + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept override; + ; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf8( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t 
latin1_length_from_utf8( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t utf8_length_from_latin1( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused result base64_to_binary( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept override; + size_t + binary_to_base64_with_lines(const char *input, size_t length, char *output, + size_t line_length, + base64_options options) const noexcept override; + const char *find(const char *start, const char *end, + char character) const noexcept override; + const char16_t *find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept override; +#endif // SIMDUTF_FEATURE_BASE64 }; } // namespace icelake @@ -2057,9 +3766,9 @@ class implementation final : public simdutf::implementation { #endif // SIMDUTF_ICELAKE_IMPLEMENTATION_H /* end file src/simdutf/icelake/implementation.h */ -// -// The rest need to be inside the region -// + // + // The rest need to be inside the region + // /* begin file src/simdutf/icelake/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "icelake" // #define SIMDUTF_IMPLEMENTATION icelake @@ -2070,11 +3779,14 @@ class implementation final : public simdutf::implementation { SIMDUTF_TARGET_ICELAKE #endif -#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +#if SIMDUTF_GCC11ORMORE // workaround for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +// clang-format off SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) +// clang-format on #endif // end of workaround /* end file src/simdutf/icelake/begin.h */ -// Declarations + // Declarations /* begin file src/simdutf/icelake/bitmanipulation.h */ #ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H #define SIMDUTF_ICELAKE_BITMANIPULATION_H @@ -2086,7 +3798,7 @@ namespace { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) { // note: we do not support legacy 32-bit Windows - return __popcnt64(input_num);// Visual Studio wants two underscores + return __popcnt64(input_num); // Visual Studio wants two underscores } #else simdutf_really_inline long long int count_ones(uint64_t input_num) { @@ -2094,14 +3806,25 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) { } #endif -#if SIMDUTF_NEED_TRAILING_ZEROES 
-simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
-#if SIMDUTF_REGULAR_VISUAL_STUDIO
-  return (int)_tzcnt_u64(input_num);
-#else // SIMDUTF_REGULAR_VISUAL_STUDIO
-  return __builtin_ctzll(input_num);
-#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+simdutf_really_inline unsigned __int64 count_ones32(uint32_t input_num) {
+  // note: we do not support legacy 32-bit Windows
+  return __popcnt(input_num); // Visual Studio wants two underscores
 }
+#else
+simdutf_really_inline long long int count_ones32(uint32_t input_num) {
+  return _popcnt32(input_num);
+}
+#endif
+
+#if SIMDUTF_NEED_TRAILING_ZEROES
+// simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
+// #if SIMDUTF_REGULAR_VISUAL_STUDIO
+//   return (int)_tzcnt_u64(input_num);
+// #else // SIMDUTF_REGULAR_VISUAL_STUDIO
+//   return __builtin_ctzll(input_num);
+// #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+// }
 #endif
 
 } // unnamed namespace
@@ -2110,6 +3833,165 @@ simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
 #endif // SIMDUTF_ICELAKE_BITMANIPULATION_H
 /* end file src/simdutf/icelake/bitmanipulation.h */
+/* begin file src/simdutf/icelake/simd.h */
+#ifndef SIMDUTF_ICELAKE_SIMD_H
+#define SIMDUTF_ICELAKE_SIMD_H
+
+namespace simdutf {
+namespace icelake {
+namespace {
+namespace simd {
+
+/* begin file src/simdutf/icelake/simd16-inl.h */
+template <typename T> struct simd16;
+
+template <> struct simd16<uint16_t> {
+  static const size_t SIZE = sizeof(__m512i);
+  static const size_t ELEMENTS = SIZE / sizeof(uint16_t);
+
+  template <typename Pointer>
+  static simdutf_really_inline simd16 load(const Pointer *ptr) {
+    return simd16(ptr);
+  }
+
+  __m512i value;
+
+  simdutf_really_inline simd16(const __m512i v) : value(v) {}
+
+  template <typename Pointer>
+  simdutf_really_inline simd16(const Pointer *ptr)
+      : value(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptr))) {}
+
+  // operators
+  simdutf_really_inline simd16 &operator+=(const simd16 other) {
+    value = _mm512_add_epi32(value, other.value);
+    return *this;
+  }
+
+  simdutf_really_inline simd16 &operator-=(const simd16 other) {
+    value = _mm512_sub_epi32(value, other.value);
+    return *this;
+  }
+
+  // methods
+  simdutf_really_inline simd16 swap_bytes() const {
+    const __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+        0x0607040502030001, 0x0e0f0c0d0a0b0809);
+
+    return _mm512_shuffle_epi8(value, byteflip);
+  }
+
+  simdutf_really_inline uint64_t sum() const {
+    const auto lo = _mm512_and_si512(value, _mm512_set1_epi32(0xffff));
+    const auto hi = _mm512_srli_epi32(value, 16);
+    const auto sum32 = _mm512_add_epi32(lo, hi);
+
+    return _mm512_reduce_add_epi32(sum32);
+  }
+
+  // static members
+  simdutf_really_inline static simd16 zero() {
+    return _mm512_setzero_si512();
+  }
+
+  simdutf_really_inline static simd16 splat(uint16_t v) {
+    return _mm512_set1_epi16(v);
+  }
+};
+
+template <> struct simd16<bool> {
+  __mmask32 value;
+
+  simdutf_really_inline simd16(const __mmask32 v) : value(v) {}
+};
+
+// ------------------------------------------------------------
+
+simdutf_really_inline simd16<uint16_t> min(const simd16<uint16_t> b,
+                                           const simd16<uint16_t> a) {
+  return _mm512_min_epu16(a.value, b.value);
+}
+
+simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> a,
+                                                 uint16_t b) {
+  return _mm512_and_si512(a.value, _mm512_set1_epi16(b));
+}
+
+simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> a,
+                                                 uint16_t b) {
+  return _mm512_xor_si512(a.value, _mm512_set1_epi16(b));
+}
+
+simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> a,
+                                                 const simd16<uint16_t> b) {
+  return _mm512_xor_si512(a.value, b.value);
+}
+
+simdutf_really_inline simd16<bool> operator==(const simd16<uint16_t> a,
+                                              uint16_t b) {
+  return _mm512_cmpeq_epi16_mask(a.value, _mm512_set1_epi16(b));
+}
+/* end file src/simdutf/icelake/simd16-inl.h */
+/* begin file src/simdutf/icelake/simd32-inl.h */
+template <typename T> struct simd32;
+
+template <> struct simd32<uint32_t> {
+  static const size_t SIZE = sizeof(__m512i);
+  static const size_t ELEMENTS = SIZE / sizeof(uint32_t);
+
+  __m512i value;
+
+  simdutf_really_inline simd32(const __m512i v) : value(v) {}
+
+  template <typename Pointer>
+  simdutf_really_inline simd32(const Pointer *ptr)
+      : value(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptr))) {}
+
+  uint64_t sum() const {
+    const __m512i mask = _mm512_set1_epi64(0xffffffff);
+    const __m512i t0 = _mm512_and_si512(value, mask);
+    const __m512i t1 = _mm512_srli_epi64(value, 32);
+    const __m512i t2 = _mm512_add_epi64(t0, t1);
+    return _mm512_reduce_add_epi64(t2);
+  }
+
+  // operators
+  simdutf_really_inline simd32 &operator+=(const simd32 other) {
+    value = _mm512_add_epi32(value, other.value);
+    return *this;
+  }
+
+  // static members
+  simdutf_really_inline static simd32 zero() {
+    return _mm512_setzero_si512();
+  }
+
+  simdutf_really_inline static simd32 splat(uint32_t v) {
+    return _mm512_set1_epi32(v);
+  }
+};
+
+simdutf_really_inline simd32<uint32_t> min(const simd32<uint32_t> b,
+                                           const simd32<uint32_t> a) {
+  return _mm512_min_epu32(a.value, b.value);
+}
+
+simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> b,
+                                                 const simd32<uint32_t> a) {
+  return _mm512_and_si512(a.value, b.value);
+}
+/* end file src/simdutf/icelake/simd32-inl.h */
+
+} // namespace simd
+} // unnamed namespace
+} // namespace icelake
+} // namespace simdutf
+
+#endif // SIMDUTF_ICELAKE_SIMD_H
+/* end file src/simdutf/icelake/simd.h */
+
 /* begin file src/simdutf/icelake/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
 // nothing needed.
@@ -2118,13 +4000,12 @@ SIMDUTF_UNTARGET_REGION
 #endif
 
-#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+#if SIMDUTF_GCC11ORMORE // workaround for
+                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
 /* end file src/simdutf/icelake/end.h */
-
-
 #endif // SIMDUTF_IMPLEMENTATION_ICELAKE
 #endif // SIMDUTF_ICELAKE_H
 /* end file src/simdutf/icelake.h */
@@ -2133,56 +4014,56 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #define SIMDUTF_HASWELL_H
 
 #ifdef SIMDUTF_WESTMERE_H
-#error "haswell.h must be included before westmere.h"
+  #error "haswell.h must be included before westmere.h"
 #endif
 #ifdef SIMDUTF_FALLBACK_H
-#error "haswell.h must be included before fallback.h"
+  #error "haswell.h must be included before fallback.h"
 #endif
 
-// Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected
-// at runtime.
+// Default Haswell to on if this is x86-64. Even if we are not compiled for it,
+// it could be selected at runtime.
 #ifndef SIMDUTF_IMPLEMENTATION_HASWELL
-//
-// You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__
-// because we want to rely on *runtime dispatch*.
-#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
-#define SIMDUTF_IMPLEMENTATION_HASWELL 0
-#else
-#define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64)
-#endif
+  //
+  // You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__
+  // because we want to rely on *runtime dispatch*.
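The *runtime dispatch* mentioned just above is the reason this kernel is compiled even when the build target lacks AVX2: the library probes the CPU once at run time and routes every call to the best compiled-in implementation. A minimal sketch of what that means for callers, using only entry points declared in the public simdutf.h (validate_utf8 and get_active_implementation):

    #include "simdutf.h"
    #include <cstdio>

    int main() {
      const char text[] = "runtime dispatch";
      // The first call through the public API selects the kernel
      // (icelake, haswell, westmere, fallback, ...) for the CPU the
      // program actually runs on; callers never pick one themselves.
      bool ok = simdutf::validate_utf8(text, sizeof(text) - 1);
      std::printf("valid=%d kernel=%s\n", ok ? 1 : 0,
                  simdutf::get_active_implementation()->name().c_str());
      return ok ? 0 : 1;
    }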
+ // + #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE + #define SIMDUTF_IMPLEMENTATION_HASWELL 0 + #else + #define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64) + #endif #endif // To see why (__BMI__) && (__LZCNT__) are not part of this next line, see // https://github.com/simdutf/simdutf/issues/1247 #if ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__)) -#define SIMDUTF_CAN_ALWAYS_RUN_HASWELL 1 + #define SIMDUTF_CAN_ALWAYS_RUN_HASWELL 1 #else -#define SIMDUTF_CAN_ALWAYS_RUN_HASWELL 0 + #define SIMDUTF_CAN_ALWAYS_RUN_HASWELL 0 #endif #if SIMDUTF_IMPLEMENTATION_HASWELL -#define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt") + #define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt") namespace simdutf { /** * Implementation for Haswell (Intel AVX2). */ -namespace haswell { -} // namespace haswell +namespace haswell {} // namespace haswell } // namespace simdutf -// -// These two need to be included outside SIMDUTF_TARGET_REGION -// + // + // These two need to be included outside SIMDUTF_TARGET_REGION + // /* begin file src/simdutf/haswell/implementation.h */ #ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H #define SIMDUTF_HASWELL_IMPLEMENTATION_H -// The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION +// The constructor may be executed on any host, so we take care not to use +// SIMDUTF_TARGET_REGION namespace simdutf { namespace haswell { @@ -2190,92 +4071,311 @@ using namespace simdutf; class implementation final : public simdutf::implementation { public: - simdutf_really_inline implementation() : simdutf::implementation( - "haswell", - "Intel/AMD AVX2", - internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 - ) {} - simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; - simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; - 
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t 
convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; - simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept; - simdutf_warn_unused size_t 
latin1_length_from_utf32(size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept; - simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; - simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length, base64_options options) const noexcept; - size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept; + simdutf_really_inline implementation() + : simdutf::implementation("haswell", "Intel/AMD AVX2", + internal::instruction_set::AVX2 | + internal::instruction_set::BMI1 | + internal::instruction_set::BMI2) {} + +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool validate_ascii(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept final; + + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && 
SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + 
simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result 
convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *buf, size_t length, + char16_t *output) const noexcept final; + simdutf_warn_unused size_t + count_utf16le(const char16_t *buf, size_t length) const noexcept override; + simdutf_warn_unused size_t + count_utf16be(const char16_t *buf, size_t length) const noexcept override; + void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept final; + void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept override; + simdutf_warn_unused size_t utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept override; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf16_length_from_utf8( + const char *input, size_t length) const noexcept override; + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept override; + ; + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept override; + ; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf8( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t latin1_length_from_utf8( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t utf8_length_from_latin1( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && 
SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused result base64_to_binary( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept override; + size_t + binary_to_base64_with_lines(const char *input, size_t length, char *output, + size_t line_length, + base64_options options) const noexcept override; + const char *find(const char *start, const char *end, + char character) const noexcept override; + const char16_t *find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept override; +#endif // SIMDUTF_FEATURE_BASE64 }; } // namespace haswell @@ -2289,71 +4389,76 @@ class implementation final : public simdutf::implementation { #ifdef SIMDUTF_VISUAL_STUDIO -// under clang within visual studio, this will include -#include // visual studio or clang + // under clang within visual studio, this will include + #include // visual studio or clang #else -#if SIMDUTF_GCC11ORMORE + #if SIMDUTF_GCC11ORMORE // We should not get warnings while including yet we do // under some versions of GCC. // If the x86intrin.h header has uninitialized values that are problematic, // it is a GCC issue, we want to ignore these warnings. SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized) -#endif - -#include // elsewhere + #endif + #include // elsewhere -#if SIMDUTF_GCC11ORMORE + #if SIMDUTF_GCC11ORMORE // cancels the suppression of the -Wuninitialized SIMDUTF_POP_DISABLE_WARNINGS -#endif + #endif #endif // SIMDUTF_VISUAL_STUDIO #ifdef SIMDUTF_CLANG_VISUAL_STUDIO -/** - * You are not supposed, normally, to include these - * headers directly. Instead you should either include intrin.h - * or x86intrin.h. However, when compiling with clang - * under Windows (i.e., when _MSC_VER is set), these headers - * only get included *if* the corresponding features are detected - * from macros: - * e.g., if __AVX2__ is set... in turn, we normally set these - * macros by compiling against the corresponding architecture - * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole - * software with these advanced instructions. In simdutf, we - * want to compile the whole program for a generic target, - * and only target our specific kernels. As a workaround, - * we directly include the needed headers. These headers would - * normally guard against such usage, but we carefully included - * (or ) before, so the headers - * are fooled. 
- */ -#include // for _blsr_u64 -#include // for __lzcnt64 -#include // for most things (AVX2, AVX512, _popcnt64) -#include -#include -#include -#include -// unfortunately, we may not get _blsr_u64, but, thankfully, clang -// has it as a macro. -#ifndef _blsr_u64 -// we roll our own -#define _blsr_u64(n) ((n - 1) & n) -#endif // _blsr_u64 -#endif // SIMDUTF_CLANG_VISUAL_STUDIO + /** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdutf, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. + */ + #include // for _blsr_u64 + #include // for __lzcnt64 + #include // for most things (AVX2, AVX512, _popcnt64) + #include + #include + #include + #include + // unfortunately, we may not get _blsr_u64, but, thankfully, clang + // has it as a macro. + #ifndef _blsr_u64 + // we roll our own + #define _blsr_u64(n) (((n) - 1) & (n)) + #endif // _blsr_u64 + // Same issue with _blsmsk_u32: + #ifndef _blsmsk_u32 + // we roll our own + #define _blsmsk_u32(n) (((n) - 1) ^ (n)) + #endif // _blsmsk_u32 +#endif // SIMDUTF_CLANG_VISUAL_STUDIO #endif // SIMDUTF_HASWELL_INTRINSICS_H /* end file src/simdutf/haswell/intrinsics.h */ -// -// The rest need to be inside the region -// + // + // The rest need to be inside the region + // /* begin file src/simdutf/haswell/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "haswell" // #define SIMDUTF_IMPLEMENTATION haswell +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL // nothing needed. 
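For reference, the two rolled-own macros above are classic bit tricks; here they are checked on a concrete value (a standalone snippet, not simdutf code):

    #include <cassert>
    #include <cstdint>

    int main() {
      // _blsr_u64(n) = ((n - 1) & n): clears the lowest set bit.
      uint64_t n = 0b101100;
      assert(((n - 1) & n) == 0b101000); // bit 2, the lowest set bit, is gone

      // _blsmsk_u32(n) = ((n - 1) ^ n): mask of all bits up to and
      // including the lowest set bit.
      uint32_t m = 0b101100;
      assert(((m - 1) ^ m) == 0b000111); // bits 0..2

      return 0;
    }

Kernels typically walk the set bits of a movemask this way: locate the lowest set bit, handle that byte position, then clear it with _blsr_u64. The related identity (x & (x - 1)) == 0, a power-of-two test, appears in the next hunk as is_power_of_two.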
@@ -2361,11 +4466,14 @@ SIMDUTF_POP_DISABLE_WARNINGS
 SIMDUTF_TARGET_HASWELL
 #endif
 
-#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+#if SIMDUTF_GCC11ORMORE // workaround for
+                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+// clang-format off
 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+// clang-format on
 #endif // end of workaround
 /* end file src/simdutf/haswell/begin.h */
-// Declarations
+  // Declarations
 /* begin file src/simdutf/haswell/bitmanipulation.h */
 #ifndef SIMDUTF_HASWELL_BITMANIPULATION_H
 #define SIMDUTF_HASWELL_BITMANIPULATION_H
@@ -2377,7 +4485,7 @@ namespace {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
 simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
   // note: we do not support legacy 32-bit Windows
-  return __popcnt64(input_num);// Visual Studio wants two underscores
+  return __popcnt64(input_num); // Visual Studio wants two underscores
 }
 #else
 simdutf_really_inline long long int count_ones(uint64_t input_num) {
@@ -2386,15 +4494,17 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 #endif
 
 #if SIMDUTF_NEED_TRAILING_ZEROES
-simdutf_inline int trailing_zeroes(uint64_t input_num) {
-#if SIMDUTF_REGULAR_VISUAL_STUDIO
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
+  #if SIMDUTF_REGULAR_VISUAL_STUDIO
   return (int)_tzcnt_u64(input_num);
-#else // SIMDUTF_REGULAR_VISUAL_STUDIO
+  #else // SIMDUTF_REGULAR_VISUAL_STUDIO
   return __builtin_ctzll(input_num);
-#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+  #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
 }
 #endif
 
+template <typename T> bool is_power_of_two(T x) { return (x & (x - 1)) == 0; }
+
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
@@ -2405,540 +4515,501 @@ simdutf_inline int trailing_zeroes(uint64_t input_num) {
 #ifndef SIMDUTF_HASWELL_SIMD_H
 #define SIMDUTF_HASWELL_SIMD_H
 
-
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace simd {
 
-  // Forward-declared so they can be used by splat and friends.
- template - struct base { - __m256i value; - - // Zero constructor - simdutf_really_inline base() : value{__m256i()} {} - - // Conversion from SIMD register - simdutf_really_inline base(const __m256i _value) : value(_value) {} - // Conversion to SIMD register - simdutf_really_inline operator const __m256i&() const { return this->value; } - simdutf_really_inline operator __m256i&() { return this->value; } - template - simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { - __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this)); - __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1)); - if (big_endian) { - const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - first = _mm256_shuffle_epi8(first, swap); - second = _mm256_shuffle_epi8(second, swap); - } - _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second); - } - simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const { - _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this))); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr+8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this,8)))); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this,1))); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this,1),8))); - } - // Bit operations - simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } - simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } - simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } - simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); } - simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } - simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } - simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } - }; - - // Forward-declared so they can be used by splat and friends. - template - struct simd8; - - template> - struct base8: base> { - typedef uint32_t bitmask_t; - typedef uint64_t bitmask2_t; - - simdutf_really_inline base8() : base>() {} - simdutf_really_inline base8(const __m256i _value) : base>(_value) {} - simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); } - simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); } - friend simdutf_really_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return _mm256_cmpeq_epi8(lhs, rhs); } +// Forward-declared so they can be used by splat and friends. 
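The trickiest idiom in this block is prev<N>() (in the removed text above and its clang-formatted replacement below): AVX2 cannot shift bytes across its two 128-bit lanes in one step, so the code first builds [high half of the previous chunk, low half of the current one] with _mm256_permute2x128_si256(prev_chunk, *this, 0x21), then stitches with _mm256_alignr_epi8. A scalar model of the assumed semantics, for orientation only:

    #include <cstdint>

    // out[i] is byte (i - n) of the current 32-byte chunk, borrowing the
    // last n bytes of the previous chunk when i < n. UTF-8 validation
    // calls this with n = 1, 2, 3 to look at each byte's predecessors
    // without losing information at chunk boundaries.
    void prev_n(const uint8_t cur[32], const uint8_t prev[32], int n,
                uint8_t out[32]) {
      for (int i = 0; i < 32; i++) {
        out[i] = (i < n) ? prev[32 - n + i] : cur[i - n];
      }
    }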
+template struct base { + __m256i value; - static const int SIZE = sizeof(base::value); + // Zero constructor + simdutf_really_inline base() : value{__m256i()} {} - template - simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { - return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); - } - }; + // Conversion from SIMD register + simdutf_really_inline base(const __m256i _value) : value(_value) {} - // SIMD byte mask type (returned by things like eq and gt) - template<> - struct simd8: base8 { - static simdutf_really_inline simd8 splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); } - - simdutf_really_inline simd8() : base8() {} - simdutf_really_inline simd8(const __m256i _value) : base8(_value) {} - // Splat constructor - simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} - - simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); } - simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } - simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); } - simdutf_really_inline bool all() const { return static_cast(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; } - simdutf_really_inline simd8 operator~() const { return *this ^ true; } - }; + simdutf_really_inline operator const __m256i &() const { return this->value; } - template - struct base8_numeric: base8 { - static simdutf_really_inline simd8 splat(T _value) { return _mm256_set1_epi8(_value); } - static simdutf_really_inline simd8 zero() { return _mm256_setzero_si256(); } - static simdutf_really_inline simd8 load(const T values[32]) { - return _mm256_loadu_si256(reinterpret_cast(values)); - } - // Repeat 16 values as many times as necessary (usually for lookup tables) - static simdutf_really_inline simd8 repeat_16( - T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 - ) { - return simd8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15, - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } - - simdutf_really_inline base8_numeric() : base8() {} - simdutf_really_inline base8_numeric(const __m256i _value) : base8(_value) {} - - // Store to array - simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } - - // Addition/subtraction are the same for signed and unsigned - simdutf_really_inline simd8 operator+(const simd8 other) const { return _mm256_add_epi8(*this, other); } - simdutf_really_inline simd8 operator-(const simd8 other) const { return _mm256_sub_epi8(*this, other); } - simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } - simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } - - // Override to distinguish from bool version - simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } - - // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) - template - simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { - return _mm256_shuffle_epi8(lookup_table, *this); - } - - template - simdutf_really_inline simd8 lookup_16( - L replace0, L replace1, L replace2, L replace3, - L replace4, L replace5, L replace6, L replace7, - L replace8, L replace9, L replace10, L replace11, - L replace12, L replace13, L replace14, L 
replace15) const { - return lookup_16(simd8::repeat_16( - replace0, replace1, replace2, replace3, - replace4, replace5, replace6, replace7, - replace8, replace9, replace10, replace11, - replace12, replace13, replace14, replace15 - )); + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this)); + __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this, 1)); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + first = _mm256_shuffle_epi8(first, swap); + second = _mm256_shuffle_epi8(second, swap); } - }; - - - // Signed bytes - template<> - struct simd8 : base8_numeric { - simdutf_really_inline simd8() : base8_numeric() {} - simdutf_really_inline simd8(const __m256i _value) : base8_numeric(_value) {} - - // Splat constructor - simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} - // Array constructor - simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} - simdutf_really_inline operator simd8() const; - // Member-by-member initialization - simdutf_really_inline simd8( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, - int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, - int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 - ) : simd8(_mm256_setr_epi8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15, - v16,v17,v18,v19,v20,v21,v22,v23, - v24,v25,v26,v27,v28,v29,v30,v31 - )) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - simdutf_really_inline static simd8 repeat_16( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) { - return simd8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15, - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } - simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; } - // Order-sensitive comparisons - simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm256_max_epi8(*this, other); } - simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm256_min_epi8(*this, other); } - simdutf_really_inline simd8 operator>(const simd8 other) const { return _mm256_cmpgt_epi8(*this, other); } - simdutf_really_inline simd8 operator<(const simd8 other) const { return _mm256_cmpgt_epi8(other, *this); } - }; + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second); + } - // Unsigned bytes - template<> - struct simd8: base8_numeric { - simdutf_really_inline simd8() : base8_numeric() {} - simdutf_really_inline simd8(const __m256i _value) : base8_numeric(_value) {} - // Splat constructor - simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} - // Array constructor - simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} - // Member-by-member initialization - simdutf_really_inline simd8( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, 
- uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, - uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, - uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 - ) : simd8(_mm256_setr_epi8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15, - v16,v17,v18,v19,v20,v21,v22,v23, - v24,v25,v26,v27,v28,v29,v30,v31 - )) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - simdutf_really_inline static simd8 repeat_16( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) { - return simd8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15, - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } - - - // Saturated math - simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm256_adds_epu8(*this, other); } - simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm256_subs_epu8(*this, other); } - - // Order-specific operations - simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm256_max_epu8(*this, other); } - simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm256_min_epu8(other, *this); } - // Same as >, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } - // Same as <, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } - simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } - simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } - simdutf_really_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } - simdutf_really_inline simd8 operator<(const simd8 other) const { return this->lt_bits(other).any_bits_set(); } - - // Bit-specific operations - simdutf_really_inline simd8 bits_not_set() const { return *this == uint8_t(0); } - simdutf_really_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } - simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } - simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } - simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; } - simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } - simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } - simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm256_testz_si256(*this, bits); } - simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } - template - simdutf_really_inline simd8 shr() const { return simd8(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } - template - simdutf_really_inline simd8 shl() const { return simd8(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } - // Get one of the bits and make a bitmask out of it. - // e.g. 
value.get_bit<7>() gets the high bit - template - simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); } - }; - simdutf_really_inline simd8::operator simd8() const { return this->value; } + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), + _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this))); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 8), + _mm256_cvtepu8_epi32(_mm256_castsi256_si128( + _mm256_srli_si256(*this, 8)))); + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(ptr + 16), + _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this, 1))); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), + _mm256_cvtepu8_epi32(_mm_srli_si128( + _mm256_extractf128_si256(*this, 1), 8))); + } + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { + return _mm256_or_si256(*this, other); + } + simdutf_really_inline Child operator&(const Child other) const { + return _mm256_and_si256(*this, other); + } + simdutf_really_inline Child operator^(const Child other) const { + return _mm256_xor_si256(*this, other); + } + simdutf_really_inline Child &operator|=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast | other; + return *this_cast; + } +}; +// Forward-declared so they can be used by splat and friends. +template struct simd8; - template - struct simd8x64 { - static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); - static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block."); - simd8 chunks[NUM_CHUNKS]; +template > +struct base8 : base> { + simdutf_really_inline base8() : base>() {} - simd8x64(const simd8x64& o) = delete; // no copy allowed - simd8x64& operator=(const simd8 other) = delete; // no assignment allowed - simd8x64() = delete; // no default constructor allowed + simdutf_really_inline base8(const __m256i _value) : base>(_value) {} - simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1) : chunks{chunk0, chunk1} {} - simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T))} {} + friend simdutf_always_inline Mask operator==(const simd8 lhs, + const simd8 rhs) { + return _mm256_cmpeq_epi8(lhs, rhs); + } - simdutf_really_inline void store(T* ptr) const { - this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); - this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); - } + static const int SIZE = sizeof(base::value); - simdutf_really_inline uint64_t to_bitmask() const { - uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); - uint64_t r_hi = this->chunks[1].to_bitmask(); - return r_lo | (r_hi << 32); - } + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return _mm256_alignr_epi8( + *this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); + } +}; - simdutf_really_inline simd8x64& operator|=(const simd8x64 &other) { - this->chunks[0] |= other.chunks[0]; - this->chunks[1] |= other.chunks[1]; - return *this; - } +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base8 { + static simdutf_really_inline simd8 splat(bool _value) { + return _mm256_set1_epi8(uint8_t(-(!!_value))); + } - simdutf_really_inline simd8 reduce_or() const { - return this->chunks[0] | this->chunks[1]; - } + simdutf_really_inline simd8(const __m256i _value) : base8(_value) {} - simdutf_really_inline bool is_ascii() const { - return 
this->reduce_or().is_ascii(); - } + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} - template - simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { - this->chunks[0].template store_ascii_as_utf16(ptr+sizeof(simd8)*0); - this->chunks[1].template store_ascii_as_utf16(ptr+sizeof(simd8)*1); - } + simdutf_really_inline uint32_t to_bitmask() const { + return uint32_t(_mm256_movemask_epi8(value)); + } +}; - simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const { - this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8)*0); - this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8)*1); - } +template struct base8_numeric : base8 { + static simdutf_really_inline simd8 splat(T _value) { + return _mm256_set1_epi8(_value); + } + static simdutf_really_inline simd8 zero() { + return _mm256_setzero_si256(); + } + static simdutf_really_inline simd8 load(const T values[32]) { + return _mm256_loadu_si256(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, + T v5, T v6, T v7, T v8, T v9, + T v10, T v11, T v12, T v13, + T v14, T v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15); + } - simdutf_really_inline simd8x64 bit_or(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] | mask, - this->chunks[1] | mask - ); - } + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m256i _value) + : base8(_value) {} - simdutf_really_inline uint64_t eq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] == mask, - this->chunks[1] == mask - ).to_bitmask(); - } + // Store to array + simdutf_really_inline void store(T dst[32]) const { + return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); + } - simdutf_really_inline uint64_t eq(const simd8x64 &other) const { - return simd8x64( - this->chunks[0] == other.chunks[0], - this->chunks[1] == other.chunks[1] - ).to_bitmask(); - } + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator-(const simd8 other) const { + return _mm256_sub_epi8(*this, other); + } + simdutf_really_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *static_cast *>(this); + } - simdutf_really_inline uint64_t lteq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] <= mask, - this->chunks[1] <= mask - ).to_bitmask(); - } + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm256_shuffle_epi8(lookup_table, *this); + } + + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } +}; - 
simdutf_really_inline uint64_t in_range(const T low, const T high) const { - const simd8 mask_low = simd8::splat(low); - const simd8 mask_high = simd8::splat(high); +// Signed bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) + : base8_numeric(_value) {} - return simd8x64( - (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), - (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { - const simd8 mask_low = simd8::splat(low); - const simd8 mask_high = simd8::splat(high); - return simd8x64( - (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), - (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t lt(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] < mask, - this->chunks[1] < mask - ).to_bitmask(); - } + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + simdutf_really_inline operator simd8() const; - simdutf_really_inline uint64_t gt(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] > mask, - this->chunks[1] > mask - ).to_bitmask(); - } - simdutf_really_inline uint64_t gteq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] >= mask, - this->chunks[1] >= mask - ).to_bitmask(); - } - simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - (simd8(__m256i(this->chunks[0])) >= mask), - (simd8(__m256i(this->chunks[1])) >= mask) - ).to_bitmask(); - } - }; // struct simd8x64 + simdutf_really_inline bool is_ascii() const { + return _mm256_movemask_epi8(*this) == 0; + } + // Order-sensitive comparisons + simdutf_really_inline simd8 operator>(const simd8 other) const { + return _mm256_cmpgt_epi8(*this, other); + } + simdutf_really_inline simd8 operator<(const simd8 other) const { + return _mm256_cmpgt_epi8(other, *this); + } +}; + +// Unsigned bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) + : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, + uint8_t v21, uint8_t v22, uint8_t v23, uint8_t v24, uint8_t v25, + uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, + uint8_t v31) + : simd8(_mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, + v31)) {} + + // Saturated math + simdutf_really_inline simd8 + saturating_sub(const simd8 other) const { + return _mm256_subs_epu8(*this, other); + } + + // Order-specific operations + 
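// Aside: AVX2 has no unsigned byte comparison, so the operators that follow
// synthesize one: a >= b holds exactly when min(a, b) == b, and the saturating
// subtraction a - b is nonzero exactly when a > b. A hedged sketch of the same
// idea with raw intrinsics (function names are ours, not the library's):
#include <immintrin.h>

static inline __m256i cmpge_epu8(__m256i a, __m256i b) {
  // All-ones in every byte position where a >= b (unsigned).
  return _mm256_cmpeq_epi8(_mm256_min_epu8(a, b), b);
}

static inline __m256i gt_bits_epu8(__m256i a, __m256i b) {
  // Nonzero (but not necessarily all-ones) where a > b, mirroring gt_bits.
  return _mm256_subs_epu8(a, b);
}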
simdutf_really_inline simd8 + min_val(const simd8 other) const { + return _mm256_min_epu8(other, *this); + } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + gt_bits(const simd8 other) const { + return this->saturating_sub(other); + } + simdutf_really_inline simd8 + operator>=(const simd8 other) const { + return other.min_val(*this) == other; + } + + // Bit-specific operations + simdutf_really_inline bool is_ascii() const { + return _mm256_movemask_epi8(*this) == 0; + } + simdutf_really_inline bool bits_not_set_anywhere() const { + return _mm256_testz_si256(*this, *this); + } + + simdutf_really_inline bool any_bits_set_anywhere() const { + return !bits_not_set_anywhere(); + } + + template simdutf_really_inline simd8 shr() const { + return simd8(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); + } + + simdutf_really_inline uint64_t sum_bytes() const { + const auto tmp = _mm256_sad_epu8(value, _mm256_setzero_si256()); + + return _mm256_extract_epi64(tmp, 0) + _mm256_extract_epi64(tmp, 1) + + _mm256_extract_epi64(tmp, 2) + _mm256_extract_epi64(tmp, 3); + } +}; +simdutf_really_inline simd8::operator simd8() const { + return this->value; +} + +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 2, + "Haswell kernel should use two registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1) + : chunks{chunk0, chunk1} {} + simdutf_really_inline simd8x64(const T *ptr) + : chunks{simd8::load(ptr), + simd8::load(ptr + sizeof(simd8) / sizeof(T))} {} + + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd8) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd8) * 1 / sizeof(T)); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } + + simdutf_really_inline simd8x64 &operator|=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + return *this; + } + + simdutf_really_inline simd8 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 0); + this->chunks[1].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 1); + } + + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8) * 0); + this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8) * 1); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low)) + .to_bitmask(); + } + + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] < mask, this->chunks[1] < mask) + 
.to_bitmask(); + } + + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] > mask, this->chunks[1] > mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] == mask, this->chunks[1] == mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64((simd8(__m256i(this->chunks[0])) >= mask), + (simd8(__m256i(this->chunks[1])) >= mask)) + .to_bitmask(); + } +}; // struct simd8x64 /* begin file src/simdutf/haswell/simd16-inl.h */ #ifdef __GNUC__ -#if __GNUC__ < 8 -#define _mm256_set_m128i(xmm1, xmm2) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2) -#define _mm256_setr_m128i(xmm2, xmm1) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2) -#endif + #if __GNUC__ < 8 + #define _mm256_set_m128i(xmm1, xmm2) \ + _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), \ + _mm256_castsi128_si256(xmm2), 2) + #define _mm256_setr_m128i(xmm2, xmm1) \ + _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), \ + _mm256_castsi128_si256(xmm2), 2) + #endif #endif -template -struct simd16; +template struct simd16; -template> -struct base16: base> { +template > +struct base16 : base> { using bitmask_type = uint32_t; simdutf_really_inline base16() : base>() {} - simdutf_really_inline base16(const __m256i _value) : base>(_value) {} + simdutf_really_inline base16(const __m256i _value) + : base>(_value) {} template - simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast(ptr))) {} - friend simdutf_really_inline Mask operator==(const simd16 lhs, const simd16 rhs) { return _mm256_cmpeq_epi16(lhs, rhs); } + simdutf_really_inline base16(const Pointer *ptr) + : base16(_mm256_loadu_si256(reinterpret_cast(ptr))) {} + + friend simdutf_always_inline Mask operator==(const simd16 lhs, + const simd16 rhs) { + return _mm256_cmpeq_epi16(lhs, rhs); + } /// the size of vector in bytes static const int SIZE = sizeof(base>::value); /// the number of elements of type T a vector can hold static const int ELEMENTS = SIZE / sizeof(T); - - template - simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { - return _mm256_alignr_epi8(*this, prev_chunk, 16 - N); - } }; // SIMD byte mask type (returned by things like eq and gt) -template<> -struct simd16: base16 { - static simdutf_really_inline simd16 splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); } +template <> struct simd16 : base16 { + static simdutf_really_inline simd16 splat(bool _value) { + return _mm256_set1_epi16(uint16_t(-(!!_value))); + } + + simdutf_really_inline simd16() : base16() {} + + simdutf_really_inline simd16(const __m256i _value) : base16(_value) {} - simdutf_really_inline simd16() : base16() {} - simdutf_really_inline simd16(const __m256i _value) : base16(_value) {} // Splat constructor - simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} + + simdutf_really_inline bitmask_type to_bitmask() const { + return _mm256_movemask_epi8(*this); + } - simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); } - simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } simdutf_really_inline simd16 operator~() const { return 
*this ^ true; } }; -template -struct base16_numeric: base16 { - static simdutf_really_inline simd16 splat(T _value) { return _mm256_set1_epi16(_value); } - static simdutf_really_inline simd16 zero() { return _mm256_setzero_si256(); } +template struct base16_numeric : base16 { + static simdutf_really_inline simd16 splat(T _value) { + return _mm256_set1_epi16(_value); + } + + static simdutf_really_inline simd16 zero() { + return _mm256_setzero_si256(); + } + static simdutf_really_inline simd16 load(const T values[8]) { return _mm256_loadu_si256(reinterpret_cast(values)); } simdutf_really_inline base16_numeric() : base16() {} - simdutf_really_inline base16_numeric(const __m256i _value) : base16(_value) {} + + simdutf_really_inline base16_numeric(const __m256i _value) + : base16(_value) {} // Store to array - simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + simdutf_really_inline void store(T dst[8]) const { + return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); + } // Override to distinguish from bool version simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFFFu; } // Addition/subtraction are the same for signed and unsigned - simdutf_really_inline simd16 operator+(const simd16 other) const { return _mm256_add_epi16(*this, other); } - simdutf_really_inline simd16 operator-(const simd16 other) const { return _mm256_sub_epi16(*this, other); } - simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return *static_cast*>(this); } - simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } -}; - -// Signed code units -template<> -struct simd16 : base16_numeric { - simdutf_really_inline simd16() : base16_numeric() {} - simdutf_really_inline simd16(const __m256i _value) : base16_numeric(_value) {} - // Splat constructor - simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} - // Array constructor - simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {} - simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} - // Order-sensitive comparisons - simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm256_max_epi16(*this, other); } - simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm256_min_epi16(*this, other); } - simdutf_really_inline simd16 operator>(const simd16 other) const { return _mm256_cmpgt_epi16(*this, other); } - simdutf_really_inline simd16 operator<(const simd16 other) const { return _mm256_cmpgt_epi16(other, *this); } + simdutf_really_inline simd16 operator+(const simd16 other) const { + return _mm256_add_epi16(*this, other); + } + simdutf_really_inline simd16 &operator+=(const simd16 other) { + *this = *this + other; + return *static_cast *>(this); + } }; // Unsigned code units -template<> -struct simd16: base16_numeric { +template <> struct simd16 : base16_numeric { simdutf_really_inline simd16() : base16_numeric() {} - simdutf_really_inline simd16(const __m256i _value) : base16_numeric(_value) {} + simdutf_really_inline simd16(const __m256i _value) + : base16_numeric(_value) {} // Splat constructor simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} // Array constructor - simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {} - simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) 
{} - - // Saturated math - simdutf_really_inline simd16 saturating_add(const simd16 other) const { return _mm256_adds_epu16(*this, other); } - simdutf_really_inline simd16 saturating_sub(const simd16 other) const { return _mm256_subs_epu16(*this, other); } + simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast(values))) {} // Order-specific operations - simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm256_max_epu16(*this, other); } - simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm256_min_epu16(*this, other); } - // Same as >, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd16 gt_bits(const simd16 other) const { return this->saturating_sub(other); } + simdutf_really_inline simd16 + max_val(const simd16 other) const { + return _mm256_max_epu16(*this, other); + } + simdutf_really_inline simd16 + min_val(const simd16 other) const { + return _mm256_min_epu16(*this, other); + } // Same as <, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd16 lt_bits(const simd16 other) const { return other.saturating_sub(*this); } - simdutf_really_inline simd16 operator<=(const simd16 other) const { return other.max_val(*this) == other; } - simdutf_really_inline simd16 operator>=(const simd16 other) const { return other.min_val(*this) == other; } - simdutf_really_inline simd16 operator>(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } - simdutf_really_inline simd16 operator<(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd16 + operator<=(const simd16 other) const { + return other.max_val(*this) == other; + } + simdutf_really_inline simd16 + operator>=(const simd16 other) const { + return other.min_val(*this) == other; + } // Bit-specific operations - simdutf_really_inline simd16 bits_not_set() const { return *this == uint16_t(0); } - simdutf_really_inline simd16 bits_not_set(simd16 bits) const { return (*this & bits).bits_not_set(); } - simdutf_really_inline simd16 any_bits_set() const { return ~this->bits_not_set(); } - simdutf_really_inline simd16 any_bits_set(simd16 bits) const { return ~this->bits_not_set(bits); } - - simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } - simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } - simdutf_really_inline bool bits_not_set_anywhere(simd16 bits) const { return _mm256_testz_si256(*this, bits); } - simdutf_really_inline bool any_bits_set_anywhere(simd16 bits) const { return !bits_not_set_anywhere(bits); } - template - simdutf_really_inline simd16 shr() const { return simd16(_mm256_srli_epi16(*this, N)); } - template - simdutf_really_inline simd16 shl() const { return simd16(_mm256_slli_epi16(*this, N)); } - // Get one of the bits and make a bitmask out of it. - // e.g. 
value.get_bit<7>() gets the high bit - template - simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); } + simdutf_really_inline simd16 bits_not_set() const { + return *this == uint16_t(0); + } + + simdutf_really_inline simd16 any_bits_set() const { + return ~this->bits_not_set(); + } + + template simdutf_really_inline simd16 shr() const { + return simd16(_mm256_srli_epi16(*this, N)); + } // Change the endianness simdutf_really_inline simd16 swap_bytes() const { - const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); return _mm256_shuffle_epi8(*this, swap); } - // Pack with the unsigned saturation of two uint16_t code units into single uint8_t vector - static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + // Pack with the unsigned saturation of two uint16_t code units into single + // uint8_t vector + static simdutf_really_inline simd8 pack(const simd16 &v0, + const simd16 &v1) { // Note: the AVX2 variant of pack operates on 128-bit lanes, thus // we have to shuffle lanes in order to produce bytes in the // correct order. @@ -2958,108 +5029,253 @@ struct simd16: base16_numeric { // pack code units in linear order from v0 and v1 return _mm256_packus_epi16(t0, t1); } -}; + simdutf_really_inline uint64_t sum() const { + const auto lo_u16 = _mm256_and_si256(value, _mm256_set1_epi32(0x0000ffff)); + const auto hi_u16 = _mm256_srli_epi32(value, 16); + const auto sum_u32 = _mm256_add_epi32(lo_u16, hi_u16); - template - struct simd16x32 { - static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); - static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block."); - simd16 chunks[NUM_CHUNKS]; + const auto lo_u32 = + _mm256_and_si256(sum_u32, _mm256_set1_epi64x(0xffffffff)); + const auto hi_u32 = _mm256_srli_epi64(sum_u32, 32); + const auto sum_u64 = _mm256_add_epi64(lo_u32, hi_u32); - simd16x32(const simd16x32& o) = delete; // no copy allowed - simd16x32& operator=(const simd16 other) = delete; // no assignment allowed - simd16x32() = delete; // no default constructor allowed + return uint64_t(_mm256_extract_epi64(sum_u64, 0)) + + uint64_t(_mm256_extract_epi64(sum_u64, 1)) + + uint64_t(_mm256_extract_epi64(sum_u64, 2)) + + uint64_t(_mm256_extract_epi64(sum_u64, 3)); + } +}; - simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1) : chunks{chunk0, chunk1} {} - simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T))} {} +template struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 2, + "Haswell kernel should use two registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; - simdutf_really_inline void store(T* ptr) const { - this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); - this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); - } + simd16x32(const simd16x32 &o) = delete; // no copy allowed + simd16x32 & + operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed - simdutf_really_inline uint64_t to_bitmask() const { - uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); - uint64_t r_hi = this->chunks[1].to_bitmask(); - return 
r_lo | (r_hi << 32); - } + simdutf_really_inline simd16x32(const simd16 chunk0, + const simd16 chunk1) + : chunks{chunk0, chunk1} {} + simdutf_really_inline simd16x32(const T *ptr) + : chunks{simd16::load(ptr), + simd16::load(ptr + sizeof(simd16) / sizeof(T))} {} - simdutf_really_inline simd16 reduce_or() const { - return this->chunks[0] | this->chunks[1]; - } + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd16) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd16) * 1 / sizeof(T)); + } - simdutf_really_inline bool is_ascii() const { - return this->reduce_or().is_ascii(); - } + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } - simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { - this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); - this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)); - } + simdutf_really_inline simd16 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } - simdutf_really_inline simd16x32 bit_or(const T m) const { - const simd16 mask = simd16::splat(m); - return simd16x32( - this->chunks[0] | mask, - this->chunks[1] | mask - ); - } + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } - simdutf_really_inline void swap_bytes() { - this->chunks[0] = this->chunks[0].swap_bytes(); - this->chunks[1] = this->chunks[1].swap_bytes(); - } + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16) * 0); + this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16)); + } - simdutf_really_inline uint64_t eq(const T m) const { - const simd16 mask = simd16::splat(m); - return simd16x32( - this->chunks[0] == mask, - this->chunks[1] == mask - ).to_bitmask(); - } + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); + } + simdutf_really_inline uint64_t gt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] > mask, this->chunks[1] > mask) + .to_bitmask(); + } - simdutf_really_inline uint64_t eq(const simd16x32 &other) const { - return simd16x32( - this->chunks[0] == other.chunks[0], - this->chunks[1] == other.chunks[1] - ).to_bitmask(); - } + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] <= mask, this->chunks[1] <= mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] == mask, this->chunks[1] == mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(static_cast(low - 1)); + const simd16 mask_high = simd16::splat(static_cast(high + 1)); + return simd16x32( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low)) + .to_bitmask(); + } +}; // struct simd16x32 - simdutf_really_inline uint64_t lteq(const T m) const { - const simd16 mask = simd16::splat(m); - return simd16x32( - this->chunks[0] <= mask, - this->chunks[1] <= mask - ).to_bitmask(); - } +simd16 min(const simd16 a, simd16 b) { + return _mm256_min_epu16(a.value, b.value); +} +/* end file 
src/simdutf/haswell/simd16-inl.h */ +/* begin file src/simdutf/haswell/simd32-inl.h */ +template struct simd32; - simdutf_really_inline uint64_t in_range(const T low, const T high) const { - const simd16 mask_low = simd16::splat(low); - const simd16 mask_high = simd16::splat(high); +template <> struct simd32 { + static const size_t SIZE = sizeof(__m256i); + static const size_t ELEMENTS = SIZE / sizeof(uint32_t); - return simd16x32( - (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), - (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { - const simd16 mask_low = simd16::splat(static_cast(low-1)); - const simd16 mask_high = simd16::splat(static_cast(high+1)); - return simd16x32( - (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), - (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t lt(const T m) const { - const simd16 mask = simd16::splat(m); - return simd16x32( - this->chunks[0] < mask, - this->chunks[1] < mask - ).to_bitmask(); - } - }; // struct simd16x32 -/* end file src/simdutf/haswell/simd16-inl.h */ + __m256i value; + + simdutf_really_inline simd32(const __m256i v) : value(v) {} + + template + simdutf_really_inline simd32(const Pointer *ptr) + : value(_mm256_loadu_si256(reinterpret_cast(ptr))) {} + + simdutf_really_inline uint64_t sum() const { + const __m256i mask = _mm256_set1_epi64x(0xffffffff); + const __m256i t0 = _mm256_and_si256(value, mask); + const __m256i t1 = _mm256_srli_epi64(value, 32); + const __m256i t2 = _mm256_add_epi64(t0, t1); + + return uint64_t(_mm256_extract_epi64(t2, 0)) + + uint64_t(_mm256_extract_epi64(t2, 1)) + + uint64_t(_mm256_extract_epi64(t2, 2)) + + uint64_t(_mm256_extract_epi64(t2, 3)); + } + + simdutf_really_inline simd32 swap_bytes() const { + const __m256i shuffle = + _mm256_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12, + 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12); + + return _mm256_shuffle_epi8(value, shuffle); + } + + // operators + simdutf_really_inline simd32 &operator+=(const simd32 other) { + value = _mm256_add_epi32(value, other.value); + return *this; + } + + // static members + simdutf_really_inline static simd32 zero() { + return _mm256_setzero_si256(); + } + + simdutf_really_inline static simd32 splat(uint32_t v) { + return _mm256_set1_epi32(v); + } +}; + +//---------------------------------------------------------------------- + +template <> struct simd32 { + // static const size_t SIZE = sizeof(__m128i); + // static const size_t ELEMENTS = SIZE / sizeof(uint32_t); + + __m256i value; + + simdutf_really_inline simd32(const __m256i v) : value(v) {} + + simdutf_really_inline bool any() const { + return _mm256_movemask_epi8(value) != 0; + } +}; + +//---------------------------------------------------------------------- + +template +simdutf_really_inline simd32 operator|(const simd32 a, + const simd32 b) { + return _mm256_or_si256(a.value, b.value); +} + +simdutf_really_inline simd32 min(const simd32 b, + const simd32 a) { + return _mm256_min_epu32(a.value, b.value); +} + +simdutf_really_inline simd32 max(const simd32 a, + const simd32 b) { + return _mm256_max_epu32(a.value, b.value); +} + +simdutf_really_inline simd32 operator&(const simd32 b, + const simd32 a) { + return _mm256_and_si256(a.value, b.value); +} + +simdutf_really_inline simd32 operator+(const simd32 a, + const simd32 b) { + return 
_mm256_add_epi32(a.value, b.value); +} + +simdutf_really_inline simd32 operator==(const simd32 a, + const simd32 b) { + return _mm256_cmpeq_epi32(a.value, b.value); +} + +simdutf_really_inline simd32 operator>=(const simd32 a, + const simd32 b) { + return _mm256_cmpeq_epi32(_mm256_max_epu32(a.value, b.value), a.value); +} + +simdutf_really_inline simd32 operator!(const simd32 v) { + return _mm256_xor_si256(v.value, _mm256_set1_epi8(-1)); +} + +simdutf_really_inline simd32 operator>(const simd32 a, + const simd32 b) { + return !(b >= a); +} +/* end file src/simdutf/haswell/simd32-inl.h */ +/* begin file src/simdutf/haswell/simd64-inl.h */ +template struct simd64; + +template <> struct simd64 { + // static const size_t SIZE = sizeof(__m256i); + // static const size_t ELEMENTS = SIZE / sizeof(uint64_t); + + __m256i value; + + simdutf_really_inline simd64(const __m256i v) : value(v) {} + + template + simdutf_really_inline simd64(const Pointer *ptr) + : value(_mm256_loadu_si256(reinterpret_cast(ptr))) {} + + simdutf_really_inline uint64_t sum() const { + return _mm256_extract_epi64(value, 0) + _mm256_extract_epi64(value, 1) + + _mm256_extract_epi64(value, 2) + _mm256_extract_epi64(value, 3); + } + + // operators + simdutf_really_inline simd64 &operator+=(const simd64 other) { + value = _mm256_add_epi64(value, other.value); + return *this; + } + + // static members + simdutf_really_inline static simd64 zero() { + return _mm256_setzero_si256(); + } + + simdutf_really_inline static simd64 splat(uint64_t v) { + return _mm256_set1_epi64x(v); + } +}; +/* end file src/simdutf/haswell/simd64-inl.h */ + +simdutf_really_inline simd64 sum_8bytes(const simd8 v) { + return _mm256_sad_epu8(v.value, simd8::zero()); +} } // namespace simd @@ -3077,8 +5293,10 @@ struct simd16: base16_numeric { SIMDUTF_UNTARGET_REGION #endif +#undef SIMDUTF_SIMD_HAS_BYTEMASK -#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +#if SIMDUTF_GCC11ORMORE // workaround for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 SIMDUTF_POP_DISABLE_WARNINGS #endif // end of workaround /* end file src/simdutf/haswell/end.h */ @@ -3091,51 +5309,51 @@ SIMDUTF_POP_DISABLE_WARNINGS #define SIMDUTF_WESTMERE_H #ifdef SIMDUTF_FALLBACK_H -#error "westmere.h must be included before fallback.h" + #error "westmere.h must be included before fallback.h" #endif // Default Westmere to on if this is x86-64, unless we'll always select Haswell. #ifndef SIMDUTF_IMPLEMENTATION_WESTMERE -// -// You do not want to set it to (SIMDUTF_IS_X86_64 && !SIMDUTF_REQUIRES_HASWELL) -// because you want to rely on runtime dispatch! -// -#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL -#define SIMDUTF_IMPLEMENTATION_WESTMERE 0 -#else -#define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64) -#endif + // + // You do not want to set it to (SIMDUTF_IS_X86_64 && + // !SIMDUTF_REQUIRES_HASWELL) because you want to rely on runtime dispatch! 
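// Aside: the comment above is the key design point: Westmere kernels are
// compiled even on hosts that could run Haswell code, so one binary can pick
// the best kernel at run time. A hedged sketch of that dispatch pattern,
// assuming GCC/Clang on x86-64; the kernel names are hypothetical stand-ins:
#include <cstddef>

static int validate_sse42(const char *, size_t) { return 1; }  // hypothetical
static int validate_avx2(const char *, size_t) { return 1; }   // hypothetical
static int validate_scalar(const char *, size_t) { return 1; } // hypothetical

static int (*pick_kernel())(const char *, size_t) {
  if (__builtin_cpu_supports("avx2")) return validate_avx2;
  if (__builtin_cpu_supports("sse4.2")) return validate_sse42;
  return validate_scalar; // portable fallback path
}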
+ // + #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL + #define SIMDUTF_IMPLEMENTATION_WESTMERE 0 + #else + #define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64) + #endif #endif #if (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__) -#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE 1 + #define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE 1 #else -#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE 0 + #define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE 0 #endif #if SIMDUTF_IMPLEMENTATION_WESTMERE -#define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt") + #define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt") namespace simdutf { /** * Implementation for Westmere (Intel SSE4.2). */ -namespace westmere { -} // namespace westmere +namespace westmere {} // namespace westmere } // namespace simdutf -// -// These two need to be included outside SIMDUTF_TARGET_REGION -// + // + // These two need to be included outside SIMDUTF_TARGET_REGION + // /* begin file src/simdutf/westmere/implementation.h */ #ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H #define SIMDUTF_WESTMERE_IMPLEMENTATION_H -// The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION +// The constructor may be executed on any host, so we take care not to use +// SIMDUTF_TARGET_REGION namespace simdutf { namespace westmere { @@ -3145,88 +5363,309 @@ using namespace simdutf; class implementation final : public simdutf::implementation { public: - simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42) {} - simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; - simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t 
convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * 
buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; - simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) 
const noexcept; - simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options) const noexcept; - size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept; + simdutf_really_inline implementation() + : simdutf::implementation("westmere", "Intel/AMD SSE4.2", + internal::instruction_set::SSE42) {} + +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool validate_ascii(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept final; + + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + 
const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const 
char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const 
char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *buf, size_t length, + char16_t *output) const noexcept final; + simdutf_warn_unused size_t + count_utf16le(const char16_t *buf, size_t length) const noexcept override; + simdutf_warn_unused size_t + count_utf16be(const char16_t *buf, size_t length) const noexcept override; + void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept final; + void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept override; + simdutf_warn_unused size_t utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept override; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf16_length_from_utf8( + const char *input, size_t length) const noexcept override; + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept override; + ; + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept override; + ; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf8( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t latin1_length_from_utf8( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t utf8_length_from_latin1( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused full_result 
base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused result base64_to_binary( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept override; + size_t + binary_to_base64_with_lines(const char *input, size_t length, char *output, + size_t line_length, + base64_options options) const noexcept override; + const char *find(const char *start, const char *end, + char character) const noexcept override; + const char16_t *find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept override; +#endif // SIMDUTF_FEATURE_BASE64 }; } // namespace westmere @@ -3239,52 +5678,49 @@ class implementation final : public simdutf::implementation { #define SIMDUTF_WESTMERE_INTRINSICS_H #ifdef SIMDUTF_VISUAL_STUDIO -// under clang within visual studio, this will include -#include // visual studio or clang + // under clang within visual studio, this will include + #include // visual studio or clang #else -#if SIMDUTF_GCC11ORMORE + #if SIMDUTF_GCC11ORMORE // We should not get warnings while including yet we do // under some versions of GCC. // If the x86intrin.h header has uninitialized values that are problematic, // it is a GCC issue, we want to ignore these warnings. SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized) -#endif - -#include // elsewhere + #endif + #include // elsewhere -#if SIMDUTF_GCC11ORMORE + #if SIMDUTF_GCC11ORMORE // cancels the suppression of the -Wuninitialized SIMDUTF_POP_DISABLE_WARNINGS -#endif + #endif #endif // SIMDUTF_VISUAL_STUDIO - #ifdef SIMDUTF_CLANG_VISUAL_STUDIO -/** - * You are not supposed, normally, to include these - * headers directly. Instead you should either include intrin.h - * or x86intrin.h. However, when compiling with clang - * under Windows (i.e., when _MSC_VER is set), these headers - * only get included *if* the corresponding features are detected - * from macros: - */ -#include // for _mm_alignr_epi8 + /** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + */ + #include // for _mm_alignr_epi8 #endif - - #endif // SIMDUTF_WESTMERE_INTRINSICS_H /* end file src/simdutf/westmere/intrinsics.h */ -// -// The rest need to be inside the region -// + // + // The rest need to be inside the region + // /* begin file src/simdutf/westmere/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "westmere" // #define SIMDUTF_IMPLEMENTATION westmere +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE // nothing needed. 
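// Aside between hunks: SIMDUTF_TARGET_WESTMERE above expands to a target
// region ("sse4.2,popcnt") so that a single translation unit can carry
// kernels for several ISAs regardless of the compiler's global -m flags.
// A hedged sketch of the underlying mechanism, assuming GCC/Clang
// (function names are ours):
#include <immintrin.h>
#include <cstdint>

__attribute__((target("sse4.2,popcnt")))
inline int popcount_sse42(uint64_t x) {
  // Legal even without -msse4.2 on the command line: the target attribute
  // enables the instruction set for this one function only, which is why the
  // dispatcher must check CPU features before calling it.
  return static_cast<int>(_mm_popcnt_u64(x));
}

inline int popcount_portable(uint64_t x) {
  int n = 0;
  while (x) { x &= x - 1; ++n; } // clear lowest set bit
  return n;
}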
@@ -3293,7 +5729,7 @@ SIMDUTF_TARGET_WESTMERE #endif /* end file src/simdutf/westmere/begin.h */ -// Declarations + // Declarations /* begin file src/simdutf/westmere/bitmanipulation.h */ #ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H #define SIMDUTF_WESTMERE_BITMANIPULATION_H @@ -3305,7 +5741,7 @@ namespace { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) { // note: we do not support legacy 32-bit Windows - return __popcnt64(input_num);// Visual Studio wants two underscores + return __popcnt64(input_num); // Visual Studio wants two underscores } #else simdutf_really_inline long long int count_ones(uint64_t input_num) { @@ -3315,16 +5751,18 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) { #if SIMDUTF_NEED_TRAILING_ZEROES simdutf_really_inline int trailing_zeroes(uint64_t input_num) { -#if SIMDUTF_REGULAR_VISUAL_STUDIO + #if SIMDUTF_REGULAR_VISUAL_STUDIO unsigned long ret; _BitScanForward64(&ret, input_num); return (int)ret; -#else // SIMDUTF_REGULAR_VISUAL_STUDIO + #else // SIMDUTF_REGULAR_VISUAL_STUDIO return __builtin_ctzll(input_num); -#endif // SIMDUTF_REGULAR_VISUAL_STUDIO + #endif // SIMDUTF_REGULAR_VISUAL_STUDIO } #endif +template bool is_power_of_two(T x) { return (x & (x - 1)) == 0; } + } // unnamed namespace } // namespace westmere } // namespace simdutf @@ -3340,20630 +5778,15452 @@ namespace westmere { namespace { namespace simd { - template - struct base { - __m128i value; - - // Zero constructor - simdutf_really_inline base() : value{__m128i()} {} - - // Conversion from SIMD register - simdutf_really_inline base(const __m128i _value) : value(_value) {} - // Conversion to SIMD register - simdutf_really_inline operator const __m128i&() const { return this->value; } - simdutf_really_inline operator __m128i&() { return this->value; } - template - simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const { - __m128i first = _mm_cvtepu8_epi16(*this); - __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this,8)); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - first = _mm_shuffle_epi8(first, swap); - second = _mm_shuffle_epi8(second, swap); - } - _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first); - _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), second); - } - simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const { - _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(p+4), _mm_cvtepu8_epi32(_mm_srli_si128(*this,4))); - _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi32(_mm_srli_si128(*this,8))); - _mm_storeu_si128(reinterpret_cast<__m128i *>(p+12), _mm_cvtepu8_epi32(_mm_srli_si128(*this,12))); - } - // Bit operations - simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); } - simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); } - simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); } - simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); } - simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } - simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return 
*this_cast; } - simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } - }; - - // Forward-declared so they can be used by splat and friends. - template - struct simd8; - - template> - struct base8: base> { - typedef uint16_t bitmask_t; - typedef uint32_t bitmask2_t; - - simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); } - simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); } - simdutf_really_inline base8() : base>() {} - simdutf_really_inline base8(const __m128i _value) : base>(_value) {} - - friend simdutf_really_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return _mm_cmpeq_epi8(lhs, rhs); } - - static const int SIZE = sizeof(base>::value); - - template - simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { - return _mm_alignr_epi8(*this, prev_chunk, 16 - N); - } - }; - - // SIMD byte mask type (returned by things like eq and gt) - template<> - struct simd8: base8 { - static simdutf_really_inline simd8 splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); } - - simdutf_really_inline simd8() : base8() {} - simdutf_really_inline simd8(const __m128i _value) : base8(_value) {} - // Splat constructor - simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} - - simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } - simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); } - simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); } - simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; } - simdutf_really_inline simd8 operator~() const { return *this ^ true; } - }; - - template - struct base8_numeric: base8 { - static simdutf_really_inline simd8 splat(T _value) { return _mm_set1_epi8(_value); } - static simdutf_really_inline simd8 zero() { return _mm_setzero_si128(); } - static simdutf_really_inline simd8 load(const T values[16]) { - return _mm_loadu_si128(reinterpret_cast(values)); - } - // Repeat 16 values as many times as necessary (usually for lookup tables) - static simdutf_really_inline simd8 repeat_16( - T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 - ) { - return simd8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } - - simdutf_really_inline base8_numeric() : base8() {} - simdutf_really_inline base8_numeric(const __m128i _value) : base8(_value) {} - - // Store to array - simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } - - // Override to distinguish from bool version - simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } - - // Addition/subtraction are the same for signed and unsigned - simdutf_really_inline simd8 operator+(const simd8 other) const { return _mm_add_epi8(*this, other); } - simdutf_really_inline simd8 operator-(const simd8 other) const { return _mm_sub_epi8(*this, other); } - simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } - simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } - - // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) - template - simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { - return 
_mm_shuffle_epi8(lookup_table, *this); - } - - template - simdutf_really_inline simd8 lookup_16( - L replace0, L replace1, L replace2, L replace3, - L replace4, L replace5, L replace6, L replace7, - L replace8, L replace9, L replace10, L replace11, - L replace12, L replace13, L replace14, L replace15) const { - return lookup_16(simd8::repeat_16( - replace0, replace1, replace2, replace3, - replace4, replace5, replace6, replace7, - replace8, replace9, replace10, replace11, - replace12, replace13, replace14, replace15 - )); - } - }; +template struct base { + __m128i value; - // Signed bytes - template<> - struct simd8 : base8_numeric { - simdutf_really_inline simd8() : base8_numeric() {} - simdutf_really_inline simd8(const __m128i _value) : base8_numeric(_value) {} - // Splat constructor - simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} - // Array constructor - simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {} - // Member-by-member initialization - simdutf_really_inline simd8( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) : simd8(_mm_setr_epi8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - )) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - simdutf_really_inline static simd8 repeat_16( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) { - return simd8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } - simdutf_really_inline operator simd8() const; - simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; } - - // Order-sensitive comparisons - simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epi8(*this, other); } - simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epi8(*this, other); } - simdutf_really_inline simd8 operator>(const simd8 other) const { return _mm_cmpgt_epi8(*this, other); } - simdutf_really_inline simd8 operator<(const simd8 other) const { return _mm_cmpgt_epi8(other, *this); } - }; + // Zero constructor + simdutf_really_inline base() : value{__m128i()} {} - // Unsigned bytes - template<> - struct simd8: base8_numeric { - simdutf_really_inline simd8() : base8_numeric() {} - simdutf_really_inline simd8(const __m128i _value) : base8_numeric(_value) {} - - // Splat constructor - simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} - // Array constructor - simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {} - // Member-by-member initialization - simdutf_really_inline simd8( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) : simd8(_mm_setr_epi8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - )) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - simdutf_really_inline static simd8 repeat_16( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) { - return simd8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, 
v9, v10,v11,v12,v13,v14,v15 - ); - } - - // Saturated math - simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm_adds_epu8(*this, other); } - simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm_subs_epu8(*this, other); } - - // Order-specific operations - simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epu8(*this, other); } - simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epu8(*this, other); } - // Same as >, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } - // Same as <, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } - simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } - simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } - simdutf_really_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } - simdutf_really_inline simd8 operator<(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } - - // Bit-specific operations - simdutf_really_inline simd8 bits_not_set() const { return *this == uint8_t(0); } - simdutf_really_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } - simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } - simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } - simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; } - - simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } - simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } - simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm_testz_si128(*this, bits); } - simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } - template - simdutf_really_inline simd8 shr() const { return simd8(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } - template - simdutf_really_inline simd8 shl() const { return simd8(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } - // Get one of the bits and make a bitmask out of it. - // e.g. 
value.get_bit<7>() gets the high bit - template - simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } - }; - simdutf_really_inline simd8::operator simd8() const { return this->value; } - - // Unsigned bytes - template<> - struct simd8: base { - static simdutf_really_inline simd8 splat(uint16_t _value) { return _mm_set1_epi16(_value); } - static simdutf_really_inline simd8 load(const uint16_t values[8]) { - return _mm_loadu_si128(reinterpret_cast(values)); - } - - simdutf_really_inline simd8() : base() {} - simdutf_really_inline simd8(const __m128i _value) : base(_value) {} - // Splat constructor - simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {} - // Array constructor - simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {} - // Member-by-member initialization - simdutf_really_inline simd8( - uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7 - ) : simd8(_mm_setr_epi16( - v0, v1, v2, v3, v4, v5, v6, v7 - )) {} - - // Saturated math - simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm_adds_epu16(*this, other); } - simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm_subs_epu16(*this, other); } - - // Order-specific operations - simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epu16(*this, other); } - simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epu16(*this, other); } - // Same as >, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } - // Same as <, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } - simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } - simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } - simdutf_really_inline simd8 operator==(const simd8 other) const { return _mm_cmpeq_epi16(*this, other); } - simdutf_really_inline simd8 operator&(const simd8 other) const { return _mm_and_si128(*this, other); } - simdutf_really_inline simd8 operator|(const simd8 other) const { return _mm_or_si128(*this, other); } - - // Bit-specific operations - simdutf_really_inline simd8 bits_not_set() const { return *this == uint16_t(0); } - simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } - - simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } - simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } - simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm_testz_si128(*this, bits); } - simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } - }; - template - struct simd8x64 { - static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); - static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block."); - simd8 chunks[NUM_CHUNKS]; - - simd8x64(const simd8x64& o) = delete; // no copy allowed - simd8x64& operator=(const simd8 other) = delete; // no assignment allowed - simd8x64() = delete; // no default constructor allowed - - simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, 
const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} - simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T)), simd8::load(ptr+2*sizeof(simd8)/sizeof(T)), simd8::load(ptr+3*sizeof(simd8)/sizeof(T))} {} - - simdutf_really_inline void store(T* ptr) const { - this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); - this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); - this->chunks[2].store(ptr+sizeof(simd8)*2/sizeof(T)); - this->chunks[3].store(ptr+sizeof(simd8)*3/sizeof(T)); - } - - simdutf_really_inline simd8x64& operator |=(const simd8x64 &other) { - this->chunks[0] |= other.chunks[0]; - this->chunks[1] |= other.chunks[1]; - this->chunks[2] |= other.chunks[2]; - this->chunks[3] |= other.chunks[3]; - return *this; - } - - simdutf_really_inline simd8 reduce_or() const { - return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); - } - - simdutf_really_inline bool is_ascii() const { - return this->reduce_or().is_ascii(); - } - - template - simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { - this->chunks[0].template store_ascii_as_utf16(ptr+sizeof(simd8)*0); - this->chunks[1].template store_ascii_as_utf16(ptr+sizeof(simd8)*1); - this->chunks[2].template store_ascii_as_utf16(ptr+sizeof(simd8)*2); - this->chunks[3].template store_ascii_as_utf16(ptr+sizeof(simd8)*3); - } - - simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const { - this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8)*0); - this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8)*1); - this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8)*2); - this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8)*3); - } - - simdutf_really_inline uint64_t to_bitmask() const { - uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); - uint64_t r1 = this->chunks[1].to_bitmask(); - uint64_t r2 = this->chunks[2].to_bitmask(); - uint64_t r3 = this->chunks[3].to_bitmask(); - return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); - } - - simdutf_really_inline uint64_t eq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] == mask, - this->chunks[1] == mask, - this->chunks[2] == mask, - this->chunks[3] == mask - ).to_bitmask(); - } - - simdutf_really_inline uint64_t eq(const simd8x64 &other) const { - return simd8x64( - this->chunks[0] == other.chunks[0], - this->chunks[1] == other.chunks[1], - this->chunks[2] == other.chunks[2], - this->chunks[3] == other.chunks[3] - ).to_bitmask(); - } - - simdutf_really_inline uint64_t lteq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] <= mask, - this->chunks[1] <= mask, - this->chunks[2] <= mask, - this->chunks[3] <= mask - ).to_bitmask(); - } - - simdutf_really_inline uint64_t in_range(const T low, const T high) const { - const simd8 mask_low = simd8::splat(low); - const simd8 mask_high = simd8::splat(high); - - return simd8x64( - (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), - (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), - (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), - (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { - const simd8 mask_low = simd8::splat(low-1); - const simd8 mask_high = simd8::splat(high+1); - return simd8x64( - (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), - (this->chunks[1] 
>= mask_high) | (this->chunks[1] <= mask_low), - (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low), - (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t lt(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] < mask, - this->chunks[1] < mask, - this->chunks[2] < mask, - this->chunks[3] < mask - ).to_bitmask(); - } - - simdutf_really_inline uint64_t gt(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] > mask, - this->chunks[1] > mask, - this->chunks[2] > mask, - this->chunks[3] > mask - ).to_bitmask(); - } - simdutf_really_inline uint64_t gteq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] >= mask, - this->chunks[1] >= mask, - this->chunks[2] >= mask, - this->chunks[3] >= mask - ).to_bitmask(); - } - simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - simd8(__m128i(this->chunks[0])) >= mask, - simd8(__m128i(this->chunks[1])) >= mask, - simd8(__m128i(this->chunks[2])) >= mask, - simd8(__m128i(this->chunks[3])) >= mask - ).to_bitmask(); - } - }; // struct simd8x64 + // Conversion from SIMD register + simdutf_really_inline base(const __m128i _value) : value(_value) {} + // Conversion to SIMD register + simdutf_really_inline operator const __m128i &() const { return this->value; } + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const { + __m128i first = _mm_cvtepu8_epi16(*this); + __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this, 8)); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + first = _mm_shuffle_epi8(first, swap); + second = _mm_shuffle_epi8(second, swap); + } + _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first); + _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 8), second); + } + simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const { + _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 4), + _mm_cvtepu8_epi32(_mm_srli_si128(*this, 4))); + _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 8), + _mm_cvtepu8_epi32(_mm_srli_si128(*this, 8))); + _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 12), + _mm_cvtepu8_epi32(_mm_srli_si128(*this, 12))); + } + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { + return _mm_or_si128(*this, other); + } + simdutf_really_inline Child operator&(const Child other) const { + return _mm_and_si128(*this, other); + } + simdutf_really_inline Child operator^(const Child other) const { + return _mm_xor_si128(*this, other); + } + simdutf_really_inline Child &operator|=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast | other; + return *this_cast; + } +}; -/* begin file src/simdutf/westmere/simd16-inl.h */ -template -struct simd16; +// Forward-declared so they can be used by splat and friends. 
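[Editorial note: SSE2 provides byte equality and signed byte comparison, but no unsigned byte greater-than; the simd8<uint8_t> specialization re-declared below therefore derives its unsigned comparisons from _mm_min_epu8 and saturating subtraction (_mm_subs_epu8). A scalar sketch of the two identities it relies on, exhaustively checkable over all byte pairs; helper names are illustrative, not part of the vendored sources.]

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Saturating subtraction, as _mm_subs_epu8 computes per byte: max(a - b, 0).
    static uint8_t sat_sub(uint8_t a, uint8_t b) {
      return a > b ? uint8_t(a - b) : uint8_t(0);
    }

    int main() {
      for (int a = 0; a < 256; ++a)
        for (int b = 0; b < 256; ++b) {
          // gt_bits: nonzero exactly where a > b (the vector code then widens
          // "nonzero" into a full 0xFF mask with any_bits_set()).
          assert((sat_sub(uint8_t(a), uint8_t(b)) != 0) == (a > b));
          // operator>=: min(a, b) == b holds exactly when a >= b.
          assert((std::min(a, b) == b) == (a >= b));
        }
    }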
+template struct simd8; -template> -struct base16: base> { +template > +struct base8 : base> { typedef uint16_t bitmask_t; typedef uint32_t bitmask2_t; - simdutf_really_inline base16() : base>() {} - simdutf_really_inline base16(const __m128i _value) : base>(_value) {} - template - simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast(ptr))) {} + simdutf_really_inline T first() const { return _mm_extract_epi8(*this, 0); } + simdutf_really_inline T last() const { return _mm_extract_epi8(*this, 15); } + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m128i _value) : base>(_value) {} - friend simdutf_really_inline Mask operator==(const simd16 lhs, const simd16 rhs) { return _mm_cmpeq_epi16(lhs, rhs); } + friend simdutf_really_inline Mask operator==(const simd8 lhs, + const simd8 rhs) { + return _mm_cmpeq_epi8(lhs, rhs); + } - static const int SIZE = sizeof(base>::value); + static const int SIZE = sizeof(base>::value); - template - simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { return _mm_alignr_epi8(*this, prev_chunk, 16 - N); } }; // SIMD byte mask type (returned by things like eq and gt) -template<> -struct simd16: base16 { - static simdutf_really_inline simd16 splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); } +template <> struct simd8 : base8 { + static simdutf_really_inline simd8 splat(bool _value) { + return _mm_set1_epi8(uint8_t(-(!!_value))); + } - simdutf_really_inline simd16() : base16() {} - simdutf_really_inline simd16(const __m128i _value) : base16(_value) {} + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m128i _value) : base8(_value) {} // Splat constructor - simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} - simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } - simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); } - simdutf_really_inline simd16 operator~() const { return *this ^ true; } + simdutf_really_inline int to_bitmask() const { + return _mm_movemask_epi8(*this); + } + simdutf_really_inline simd8 operator~() const { return *this ^ true; } }; -template -struct base16_numeric: base16 { - static simdutf_really_inline simd16 splat(T _value) { return _mm_set1_epi16(_value); } - static simdutf_really_inline simd16 zero() { return _mm_setzero_si128(); } - static simdutf_really_inline simd16 load(const T values[8]) { +template struct base8_numeric : base8 { + static simdutf_really_inline simd8 splat(T _value) { + return _mm_set1_epi8(_value); + } + static simdutf_really_inline simd8 zero() { return _mm_setzero_si128(); } + static simdutf_really_inline simd8 load(const T values[16]) { return _mm_loadu_si128(reinterpret_cast(values)); } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, + T v5, T v6, T v7, T v8, T v9, + T v10, T v11, T v12, T v13, + T v14, T v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15); + } - simdutf_really_inline base16_numeric() : base16() {} - simdutf_really_inline base16_numeric(const __m128i _value) : base16(_value) {} + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m128i _value) + : base8(_value) 
{} // Store to array - simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + simdutf_really_inline void store(T dst[16]) const { + return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); + } // Override to distinguish from bool version - simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } // Addition/subtraction are the same for signed and unsigned - simdutf_really_inline simd16 operator+(const simd16 other) const { return _mm_add_epi16(*this, other); } - simdutf_really_inline simd16 operator-(const simd16 other) const { return _mm_sub_epi16(*this, other); } - simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return *static_cast*>(this); } - simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } + simdutf_really_inline simd8 operator-(const simd8 other) const { + return _mm_sub_epi8(*this, other); + } + simdutf_really_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *static_cast *>(this); + } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm_shuffle_epi8(lookup_table, *this); + } + + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } }; -// Signed code units -template<> -struct simd16 : base16_numeric { - simdutf_really_inline simd16() : base16_numeric() {} - simdutf_really_inline simd16(const __m128i _value) : base16_numeric(_value) {} +// Signed bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) + : base8_numeric(_value) {} // Splat constructor - simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} - // Array constructor - simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {} - simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} // Member-by-member initialization - simdutf_really_inline simd16( - int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7) - : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {} - simdutf_really_inline operator simd16() const; + simdutf_really_inline operator simd8() const; + simdutf_really_inline bool is_ascii() const { + return _mm_movemask_epi8(*this) == 0; + } // Order-sensitive comparisons - simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm_max_epi16(*this, other); } - simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm_min_epi16(*this, other); } - simdutf_really_inline simd16 operator>(const simd16 other) const { return _mm_cmpgt_epi16(*this, other); } - simdutf_really_inline simd16 operator<(const simd16 other) const { return 
_mm_cmpgt_epi16(other, *this); } + simdutf_really_inline simd8 operator>(const simd8 other) const { + return _mm_cmpgt_epi8(*this, other); + } + simdutf_really_inline simd8 operator<(const simd8 other) const { + return _mm_cmpgt_epi8(other, *this); + } }; -// Unsigned code units -template<> -struct simd16: base16_numeric { - simdutf_really_inline simd16() : base16_numeric() {} - simdutf_really_inline simd16(const __m128i _value) : base16_numeric(_value) {} +// Unsigned bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) + : base8_numeric(_value) {} // Splat constructor - simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} // Array constructor - simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {} - simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {} // Member-by-member initialization - simdutf_really_inline simd16( - uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7) - : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - simdutf_really_inline static simd16 repeat_16( - uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7 - ) { - return simd16(v0, v1, v2, v3, v4, v5, v6, v7); - } + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8(_mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15)) {} // Saturated math - simdutf_really_inline simd16 saturating_add(const simd16 other) const { return _mm_adds_epu16(*this, other); } - simdutf_really_inline simd16 saturating_sub(const simd16 other) const { return _mm_subs_epu16(*this, other); } - + simdutf_really_inline simd8 + saturating_sub(const simd8 other) const { + return _mm_subs_epu8(*this, other); + } + // Order-specific operations - simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm_max_epu16(*this, other); } - simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm_min_epu16(*this, other); } + simdutf_really_inline simd8 + min_val(const simd8 other) const { + return _mm_min_epu8(*this, other); + } // Same as >, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd16 gt_bits(const simd16 other) const { return this->saturating_sub(other); } + simdutf_really_inline simd8 + gt_bits(const simd8 other) const { + return this->saturating_sub(other); + } // Same as <, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd16 lt_bits(const simd16 other) const { return other.saturating_sub(*this); } - simdutf_really_inline simd16 operator<=(const simd16 other) const { return other.max_val(*this) == other; } - simdutf_really_inline simd16 operator>=(const simd16 other) const { return other.min_val(*this) == other; } - simdutf_really_inline simd16 operator>(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } - simdutf_really_inline simd16 operator<(const simd16 other) const { return 
this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd8 + operator>=(const simd8 other) const { + return other.min_val(*this) == other; + } // Bit-specific operations - simdutf_really_inline simd16 bits_not_set() const { return *this == uint16_t(0); } - simdutf_really_inline simd16 bits_not_set(simd16 bits) const { return (*this & bits).bits_not_set(); } - simdutf_really_inline simd16 any_bits_set() const { return ~this->bits_not_set(); } - simdutf_really_inline simd16 any_bits_set(simd16 bits) const { return ~this->bits_not_set(bits); } - - simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } - simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } - simdutf_really_inline bool bits_not_set_anywhere(simd16 bits) const { return _mm_testz_si128(*this, bits); } - simdutf_really_inline bool any_bits_set_anywhere(simd16 bits) const { return !bits_not_set_anywhere(bits); } - template - simdutf_really_inline simd16 shr() const { return simd16(_mm_srli_epi16(*this, N)); } - template - simdutf_really_inline simd16 shl() const { return simd16(_mm_slli_epi16(*this, N)); } - // Get one of the bits and make a bitmask out of it. - // e.g. value.get_bit<7>() gets the high bit - template - simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } + simdutf_really_inline simd8 bits_not_set() const { + return *this == uint8_t(0); + } + simdutf_really_inline simd8 any_bits_set() const { + return ~this->bits_not_set(); + } + simdutf_really_inline bool is_ascii() const { + return _mm_movemask_epi8(*this) == 0; + } - // Change the endianness - simdutf_really_inline simd16 swap_bytes() const { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - return _mm_shuffle_epi8(*this, swap); + simdutf_really_inline bool bits_not_set_anywhere() const { + return _mm_testz_si128(*this, *this); + } + simdutf_really_inline bool any_bits_set_anywhere() const { + return !bits_not_set_anywhere(); + } + template simdutf_really_inline simd8 shr() const { + return simd8(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); + } + template simdutf_really_inline simd8 shl() const { + return simd8(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } - // Pack with the unsigned saturation of two uint16_t code units into single uint8_t vector - static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { - return _mm_packus_epi16(v0, v1); + simdutf_really_inline uint64_t sum_bytes() const { + const auto tmp = _mm_sad_epu8(value, _mm_setzero_si128()); + return _mm_extract_epi64(tmp, 0) + _mm_extract_epi64(tmp, 1); } }; -simdutf_really_inline simd16::operator simd16() const { return this->value; } -template - struct simd16x32 { - static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); - static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block."); - simd16 chunks[NUM_CHUNKS]; - - simd16x32(const simd16x32& o) = delete; // no copy allowed - simd16x32& operator=(const simd16 other) = delete; // no assignment allowed - simd16x32() = delete; // no default constructor allowed - - simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1, const simd16 chunk2, const simd16 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} - simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T)), simd16::load(ptr+2*sizeof(simd16)/sizeof(T)), 
simd16::load(ptr+3*sizeof(simd16)/sizeof(T))} {} - - simdutf_really_inline void store(T* ptr) const { - this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); - this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); - this->chunks[2].store(ptr+sizeof(simd16)*2/sizeof(T)); - this->chunks[3].store(ptr+sizeof(simd16)*3/sizeof(T)); - } - - simdutf_really_inline simd16 reduce_or() const { - return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); - } - - simdutf_really_inline bool is_ascii() const { - return this->reduce_or().is_ascii(); - } - - simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { - this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); - this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)*1); - this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16)*2); - this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16)*3); - } - - simdutf_really_inline uint64_t to_bitmask() const { - uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); - uint64_t r1 = this->chunks[1].to_bitmask(); - uint64_t r2 = this->chunks[2].to_bitmask(); - uint64_t r3 = this->chunks[3].to_bitmask(); - return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); - } - - simdutf_really_inline void swap_bytes() { - this->chunks[0] = this->chunks[0].swap_bytes(); - this->chunks[1] = this->chunks[1].swap_bytes(); - this->chunks[2] = this->chunks[2].swap_bytes(); - this->chunks[3] = this->chunks[3].swap_bytes(); - } - - simdutf_really_inline uint64_t eq(const T m) const { - const simd16 mask = simd16::splat(m); - return simd16x32( - this->chunks[0] == mask, - this->chunks[1] == mask, - this->chunks[2] == mask, - this->chunks[3] == mask - ).to_bitmask(); - } - - simdutf_really_inline uint64_t eq(const simd16x32 &other) const { - return simd16x32( - this->chunks[0] == other.chunks[0], - this->chunks[1] == other.chunks[1], - this->chunks[2] == other.chunks[2], - this->chunks[3] == other.chunks[3] - ).to_bitmask(); - } - - simdutf_really_inline uint64_t lteq(const T m) const { - const simd16 mask = simd16::splat(m); - return simd16x32( - this->chunks[0] <= mask, - this->chunks[1] <= mask, - this->chunks[2] <= mask, - this->chunks[3] <= mask - ).to_bitmask(); - } - - simdutf_really_inline uint64_t in_range(const T low, const T high) const { - const simd16 mask_low = simd16::splat(low); - const simd16 mask_high = simd16::splat(high); - - return simd16x32( - (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), - (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), - (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), - (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { - const simd16 mask_low = simd16::splat(static_cast(low-1)); - const simd16 mask_high = simd16::splat(static_cast(high+1)); - return simd16x32( - (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), - (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low), - (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low), - (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t lt(const T m) const { - const simd16 mask = simd16::splat(m); - return simd16x32( - this->chunks[0] < mask, - this->chunks[1] < mask, - this->chunks[2] < mask, - this->chunks[3] < mask - ).to_bitmask(); - } - }; // struct simd16x32 -/* end file src/simdutf/westmere/simd16-inl.h */ +simdutf_really_inline simd8::operator 
simd8() const { + return this->value; +} -} // namespace simd -} // unnamed namespace -} // namespace westmere -} // namespace simdutf +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, + "Westmere kernel should use four registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; -#endif // SIMDUTF_WESTMERE_SIMD_INPUT_H -/* end file src/simdutf/westmere/simd.h */ + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed -/* begin file src/simdutf/westmere/end.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE -// nothing needed. -#else -SIMDUTF_UNTARGET_REGION -#endif + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, + const simd8 chunk2, const simd8 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T *ptr) + : chunks{simd8::load(ptr), + simd8::load(ptr + sizeof(simd8) / sizeof(T)), + simd8::load(ptr + 2 * sizeof(simd8) / sizeof(T)), + simd8::load(ptr + 3 * sizeof(simd8) / sizeof(T))} {} -/* end file src/simdutf/westmere/end.h */ + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd8) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd8) * 1 / sizeof(T)); + this->chunks[2].store(ptr + sizeof(simd8) * 2 / sizeof(T)); + this->chunks[3].store(ptr + sizeof(simd8) * 3 / sizeof(T)); + } -#endif // SIMDUTF_IMPLEMENTATION_WESTMERE -#endif // SIMDUTF_WESTMERE_COMMON_H -/* end file src/simdutf/westmere.h */ -/* begin file src/simdutf/ppc64.h */ -#ifndef SIMDUTF_PPC64_H -#define SIMDUTF_PPC64_H + simdutf_really_inline simd8x64 &operator|=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + this->chunks[2] |= other.chunks[2]; + this->chunks[3] |= other.chunks[3]; + return *this; + } -#ifdef SIMDUTF_FALLBACK_H -#error "ppc64.h must be included before fallback.h" -#endif + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); + } + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } -#ifndef SIMDUTF_IMPLEMENTATION_PPC64 -#define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64) -#endif -#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64 + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 0); + this->chunks[1].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 1); + this->chunks[2].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 2); + this->chunks[3].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 3); + } + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8) * 0); + this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8) * 1); + this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8) * 2); + this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8) * 3); + } + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } -#if SIMDUTF_IMPLEMENTATION_PPC64 + 
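[Editorial note: to_bitmask above packs the four 16-bit movemasks into one 64-bit mask, one bit per byte of the 64-byte block; combined with trailing_zeroes from the bitmanipulation header this yields the index of the first matching byte. A minimal SSE2 sketch of the same pattern on a single 16-byte chunk; __builtin_ctz is the GCC/Clang builtin, MSVC would use _BitScanForward as in the header above.]

    #include <emmintrin.h>
    #include <cstdio>

    // Index of the first byte >= 0x80 (non-ASCII) in a 16-byte chunk, or 16
    // if none: movemask gathers the byte sign bits into an int, and a
    // count-trailing-zeroes gives the position of the lowest set bit.
    int first_non_ascii(const unsigned char *p) {
      __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
      int mask = _mm_movemask_epi8(chunk); // bit i == sign bit of byte i
      return mask == 0 ? 16 : __builtin_ctz(mask);
    }

    int main() {
      unsigned char data[16] = {'a', 'b', 'c', 0xC3, 0xA9, 'd'};
      std::printf("%d\n", first_non_ascii(data)); // prints 3
    }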
simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] < mask, this->chunks[1] < mask, + this->chunks[2] < mask, this->chunks[3] < mask) + .to_bitmask(); + } -namespace simdutf { -/** - * Implementation for ALTIVEC (PPC64). - */ -namespace ppc64 { -} // namespace ppc64 -} // namespace simdutf + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] > mask, this->chunks[1] > mask, + this->chunks[2] > mask, this->chunks[3] > mask) + .to_bitmask(); + } -/* begin file src/simdutf/ppc64/implementation.h */ -#ifndef SIMDUTF_PPC64_IMPLEMENTATION_H -#define SIMDUTF_PPC64_IMPLEMENTATION_H + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] == mask, this->chunks[1] == mask, + this->chunks[2] == mask, this->chunks[3] == mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(simd8(__m128i(this->chunks[0])) >= mask, + simd8(__m128i(this->chunks[1])) >= mask, + simd8(__m128i(this->chunks[2])) >= mask, + simd8(__m128i(this->chunks[3])) >= mask) + .to_bitmask(); + } +}; // struct simd8x64 -namespace simdutf { -namespace ppc64 { +/* begin file src/simdutf/westmere/simd16-inl.h */ +template struct simd16; -namespace { -using namespace simdutf; -} // namespace +template > +struct base16 : base> { + simdutf_really_inline base16() : base>() {} -class implementation final : public simdutf::implementation { -public: - simdutf_really_inline implementation() - : simdutf::implementation("ppc64", "PPC64 ALTIVEC", - internal::instruction_set::ALTIVEC) {} - simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; - simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const 
noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; - 
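[Editorial note: the endianness and counting methods declared here and just below are also exposed as free functions that pick the best kernel at runtime. A small usage sketch, assuming the public wrappers simdutf::change_endianness_utf16 and simdutf::count_utf16le; not part of the vendored sources.]

    #include "simdutf.h"
    #include <vector>

    // Convert a UTF-16BE buffer (e.g. read from a BOM-tagged file) to
    // UTF-16LE, then count the Unicode code points it holds.
    size_t code_points_from_be(const char16_t *be, size_t len) {
      std::vector<char16_t> le(len);
      simdutf::change_endianness_utf16(be, len, le.data()); // byte-swap each unit
      return simdutf::count_utf16le(le.data(), len); // surrogate pairs count once
    }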
simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options) const noexcept; - size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept; -}; + simdutf_really_inline base16(const __m128i _value) + : base>(_value) {} -} // namespace ppc64 -} // namespace simdutf + friend simdutf_really_inline Mask operator==(const simd16 lhs, + const simd16 rhs) { + return _mm_cmpeq_epi16(lhs, rhs); + } -#endif // SIMDUTF_PPC64_IMPLEMENTATION_H -/* end file src/simdutf/ppc64/implementation.h */ + /// the size of vector in bytes + static const int SIZE = sizeof(base>::value); -/* begin file src/simdutf/ppc64/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "ppc64" -// #define SIMDUTF_IMPLEMENTATION ppc64 -/* end file src/simdutf/ppc64/begin.h */ + /// the number of elements of type T a vector can hold + static const int ELEMENTS = SIZE / sizeof(T); +}; -// Declarations -/* begin file src/simdutf/ppc64/intrinsics.h */ -#ifndef SIMDUTF_PPC64_INTRINSICS_H -#define SIMDUTF_PPC64_INTRINSICS_H +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd16 : base16 { + static simdutf_really_inline simd16 splat(bool _value) { + return _mm_set1_epi16(uint16_t(-(!!_value))); + } + simdutf_really_inline simd16(const __m128i _value) : base16(_value) {} -// This should be the correct header whether -// you use visual studio or other compilers. -#include + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} -// These are defined by altivec.h in GCC toolchain, it is safe to undef them. 
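[Editorial note on the #undefs that follow: with the GNU toolchain, altivec.h defines vector (and on some configurations bool and pixel) as plain macros, which collides with standard C++ such as std::vector. A sketch of the clash this guards against; PPC64 with GCC and -maltivec assumed, illustrative only.]

    #include <vector>    // must come first: the macros below would break it
    #include <altivec.h> // GNU toolchain: may #define vector / bool / pixel

    #ifdef vector
      #undef vector // otherwise std::vector<int> expands to std::__vector<int>
    #endif
    #ifdef bool
      #undef bool
    #endif

    std::vector<int> lengths;  // well-formed again once the macros are gone
    __vector unsigned char v;  // AltiVec types remain usable via __vector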
-#ifdef bool -#undef bool -#endif + simdutf_really_inline int to_bitmask() const { + return _mm_movemask_epi8(*this); + } -#ifdef vector -#undef vector -#endif + simdutf_really_inline simd16 operator~() const { return *this ^ true; } +}; -#endif // SIMDUTF_PPC64_INTRINSICS_H -/* end file src/simdutf/ppc64/intrinsics.h */ -/* begin file src/simdutf/ppc64/bitmanipulation.h */ -#ifndef SIMDUTF_PPC64_BITMANIPULATION_H -#define SIMDUTF_PPC64_BITMANIPULATION_H +template struct base16_numeric : base16 { + static simdutf_really_inline simd16 splat(T _value) { + return _mm_set1_epi16(_value); + } -namespace simdutf { -namespace ppc64 { -namespace { + static simdutf_really_inline simd16 zero() { return _mm_setzero_si128(); } -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO -simdutf_really_inline int count_ones(uint64_t input_num) { - // note: we do not support legacy 32-bit Windows - return __popcnt64(input_num); // Visual Studio wants two underscores -} -#else -simdutf_really_inline int count_ones(uint64_t input_num) { - return __builtin_popcountll(input_num); -} -#endif + static simdutf_really_inline simd16 load(const T values[8]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf + simdutf_really_inline base16_numeric() : base16() {} -#endif // SIMDUTF_PPC64_BITMANIPULATION_H -/* end file src/simdutf/ppc64/bitmanipulation.h */ -/* begin file src/simdutf/ppc64/simd.h */ -#ifndef SIMDUTF_PPC64_SIMD_H -#define SIMDUTF_PPC64_SIMD_H + simdutf_really_inline base16_numeric(const __m128i _value) + : base16(_value) {} -#include + // Store to array + simdutf_really_inline void store(T dst[8]) const { + return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); + } -namespace simdutf { -namespace ppc64 { -namespace { -namespace simd { + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } -using __m128i = __vector unsigned char; + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { + return _mm_add_epi16(*this, other); + } + simdutf_really_inline simd16 &operator+=(const simd16 other) { + *this = *this + other; + return *static_cast *>(this); + } +}; -template struct base { - __m128i value; +// Unsigned code units +template <> struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} - // Zero constructor - simdutf_really_inline base() : value{__m128i()} {} + simdutf_really_inline simd16(const __m128i _value) + : base16_numeric(_value) {} - // Conversion from SIMD register - simdutf_really_inline base(const __m128i _value) : value(_value) {} + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} - // Conversion to SIMD register - simdutf_really_inline operator const __m128i &() const { - return this->value; + // Array constructor + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast(values))) {} + + // Order-specific operations + simdutf_really_inline simd16 + max_val(const simd16 other) const { + return _mm_max_epu16(*this, other); } - simdutf_really_inline operator __m128i &() { return this->value; } - // Bit operations - simdutf_really_inline Child operator|(const Child other) const { - return vec_or(this->value, (__m128i)other); + simdutf_really_inline simd16 + min_val(const simd16 other) const { + return _mm_min_epu16(*this, other); } - simdutf_really_inline Child operator&(const 
Child other) const { - return vec_and(this->value, (__m128i)other); + + simdutf_really_inline simd16 + operator<=(const simd16 other) const { + return other.max_val(*this) == other; } - simdutf_really_inline Child operator^(const Child other) const { - return vec_xor(this->value, (__m128i)other); + simdutf_really_inline simd16 + operator>=(const simd16 other) const { + return other.min_val(*this) == other; } - simdutf_really_inline Child bit_andnot(const Child other) const { - return vec_andc(this->value, (__m128i)other); + // Bit-specific operations + simdutf_really_inline simd16 bits_not_set() const { + return *this == uint16_t(0); } - simdutf_really_inline Child &operator|=(const Child other) { - auto this_cast = static_cast(this); - *this_cast = *this_cast | other; - return *this_cast; + + simdutf_really_inline simd16 any_bits_set() const { + return ~this->bits_not_set(); } - simdutf_really_inline Child &operator&=(const Child other) { - auto this_cast = static_cast(this); - *this_cast = *this_cast & other; - return *this_cast; + + template simdutf_really_inline simd16 shr() const { + return simd16(_mm_srli_epi16(*this, N)); } - simdutf_really_inline Child &operator^=(const Child other) { - auto this_cast = static_cast(this); - *this_cast = *this_cast ^ other; - return *this_cast; + + // Change the endianness + simdutf_really_inline simd16 swap_bytes() const { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + return _mm_shuffle_epi8(*this, swap); } -}; -// Forward-declared so they can be used by splat and friends. -template struct simd8; + // Pack with the unsigned saturation of two uint16_t code units into single + // uint8_t vector + static simdutf_really_inline simd8 pack(const simd16 &v0, + const simd16 &v1) { + return _mm_packus_epi16(v0, v1); + } -template > -struct base8 : base> { - typedef uint16_t bitmask_t; - typedef uint32_t bitmask2_t; + simdutf_really_inline uint64_t sum() const { + const auto lo_u16 = _mm_and_si128(value, _mm_set1_epi32(0x0000ffff)); + const auto hi_u16 = _mm_srli_epi32(value, 16); + const auto sum_u32 = _mm_add_epi32(lo_u16, hi_u16); - simdutf_really_inline base8() : base>() {} - simdutf_really_inline base8(const __m128i _value) : base>(_value) {} + const auto lo_u32 = _mm_and_si128(sum_u32, _mm_set1_epi64x(0xffffffff)); + const auto hi_u32 = _mm_srli_epi64(sum_u32, 32); + const auto sum_u64 = _mm_add_epi64(lo_u32, hi_u32); - friend simdutf_really_inline Mask operator==(const simd8 lhs, const simd8 rhs) { - return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs); + return uint64_t(_mm_extract_epi64(sum_u64, 0)) + + uint64_t(_mm_extract_epi64(sum_u64, 1)); } +}; - static const int SIZE = sizeof(base>::value); +template struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 4, + "Westmere kernel should use four registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; - template - simdutf_really_inline simd8 prev(simd8 prev_chunk) const { - __m128i chunk = this->value; -#ifdef __LITTLE_ENDIAN__ - chunk = (__m128i)vec_reve(this->value); - prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk); -#endif - chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N); -#ifdef __LITTLE_ENDIAN__ - chunk = (__m128i)vec_reve((__m128i)chunk); -#endif - return chunk; + simd16x32(const simd16x32 &o) = delete; // no copy allowed + simd16x32 & + operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + 
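// Aside (standalone sketch, not part of the vendored diff): the idiom behind
// operator<= above. SSE has no unsigned 16-bit compare instruction, but with
// SSE4.1's _mm_max_epu16, "a <= b" holds per lane exactly when
// max(a, b) == b. Compile with SSE4.1 enabled (e.g. -msse4.1):
#include <smmintrin.h> // SSE4.1
#include <cstdio>
int main() {
  __m128i a = _mm_set1_epi16((short)0x8000); // 32768 read as unsigned
  __m128i b = _mm_set1_epi16((short)0x7FFF); // 32767 read as unsigned
  __m128i le = _mm_cmpeq_epi16(_mm_max_epu16(a, b), b); // lane-wise a <= b
  printf("%s\n", _mm_movemask_epi8(le) ? "a <= b" : "a > b"); // prints: a > b
}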
simdutf_really_inline + simd16x32(const simd16 chunk0, const simd16 chunk1, + const simd16 chunk2, const simd16 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T *ptr) + : chunks{simd16::load(ptr), + simd16::load(ptr + sizeof(simd16) / sizeof(T)), + simd16::load(ptr + 2 * sizeof(simd16) / sizeof(T)), + simd16::load(ptr + 3 * sizeof(simd16) / sizeof(T))} {} + + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd16) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd16) * 1 / sizeof(T)); + this->chunks[2].store(ptr + sizeof(simd16) * 2 / sizeof(T)); + this->chunks[3].store(ptr + sizeof(simd16) * 3 / sizeof(T)); } -}; -// SIMD byte mask type (returned by things like eq and gt) -template <> struct simd8 : base8 { - static simdutf_really_inline simd8 splat(bool _value) { - return (__m128i)vec_splats((unsigned char)(-(!!_value))); + simdutf_really_inline simd16 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); } - simdutf_really_inline simd8() : base8() {} - simdutf_really_inline simd8(const __m128i _value) - : base8(_value) {} - // Splat constructor - simdutf_really_inline simd8(bool _value) - : base8(splat(_value)) {} + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } - simdutf_really_inline int to_bitmask() const { - __vector unsigned long long result; - const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, - 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00}; - - result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value, - (__m128i)perm_mask)); -#ifdef __LITTLE_ENDIAN__ - return static_cast(result[1]); -#else - return static_cast(result[0]); -#endif + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16) * 0); + this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16) * 1); + this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16) * 2); + this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16) * 3); } - simdutf_really_inline bool any() const { - return !vec_all_eq(this->value, (__m128i)vec_splats(0)); + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); } - simdutf_really_inline simd8 operator~() const { - return this->value ^ (__m128i)splat(true); + + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); + this->chunks[2] = this->chunks[2].swap_bytes(); + this->chunks[3] = this->chunks[3].swap_bytes(); } -}; -template struct base8_numeric : base8 { - static simdutf_really_inline simd8 splat(T value) { - (void)value; - return (__m128i)vec_splats(value); + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] <= mask, this->chunks[1] <= mask, + this->chunks[2] <= mask, this->chunks[3] <= mask) + .to_bitmask(); } - static simdutf_really_inline simd8 zero() { return splat(0); } - static simdutf_really_inline simd8 load(const T values[16]) { - return (__m128i)(vec_vsx_ld(0, reinterpret_cast(values))); + + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = 
simd16::splat(m); + return simd16x32(this->chunks[0] == mask, this->chunks[1] == mask, + this->chunks[2] == mask, this->chunks[3] == mask) + .to_bitmask(); } - // Repeat 16 values as many times as necessary (usually for lookup tables) - static simdutf_really_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, - T v5, T v6, T v7, T v8, T v9, - T v10, T v11, T v12, T v13, - T v14, T v15) { - return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15); + + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(static_cast(low - 1)); + const simd16 mask_high = simd16::splat(static_cast(high + 1)); + return simd16x32( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low), + (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low), + (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)) + .to_bitmask(); } +}; // struct simd16x32 - simdutf_really_inline base8_numeric() : base8() {} - simdutf_really_inline base8_numeric(const __m128i _value) - : base8(_value) {} +simd16 min(const simd16 a, simd16 b) { + return _mm_min_epu16(a.value, b.value); +} +/* end file src/simdutf/westmere/simd16-inl.h */ +/* begin file src/simdutf/westmere/simd32-inl.h */ +template struct simd32; - // Store to array - simdutf_really_inline void store(T dst[16]) const { - vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst)); - } +template <> struct simd32 { + static const size_t SIZE = sizeof(__m128i); + static const size_t ELEMENTS = SIZE / sizeof(uint32_t); - // Override to distinguish from bool version - simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + __m128i value; - // Addition/subtraction are the same for signed and unsigned - simdutf_really_inline simd8 operator+(const simd8 other) const { - return (__m128i)((__m128i)this->value + (__m128i)other); - } - simdutf_really_inline simd8 operator-(const simd8 other) const { - return (__m128i)((__m128i)this->value - (__m128i)other); - } - simdutf_really_inline simd8 &operator+=(const simd8 other) { - *this = *this + other; - return *static_cast *>(this); - } - simdutf_really_inline simd8 &operator-=(const simd8 other) { - *this = *this - other; - return *static_cast *>(this); - } + simdutf_really_inline simd32(const __m128i v) : value(v) {} - // Perform a lookup assuming the value is between 0 and 16 (undefined behavior - // for out of range values) - template - simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { - return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value); - } + template + simdutf_really_inline simd32(const Pointer *ptr) + : value(_mm_loadu_si128(reinterpret_cast(ptr))) {} - template - simdutf_really_inline simd8 - lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, - L replace5, L replace6, L replace7, L replace8, L replace9, - L replace10, L replace11, L replace12, L replace13, L replace14, - L replace15) const { - return lookup_16(simd8::repeat_16( - replace0, replace1, replace2, replace3, replace4, replace5, replace6, - replace7, replace8, replace9, replace10, replace11, replace12, - replace13, replace14, replace15)); + simdutf_really_inline uint64_t sum() const { + return uint64_t(_mm_extract_epi32(value, 0)) + + uint64_t(_mm_extract_epi32(value, 1)) + + uint64_t(_mm_extract_epi32(value, 2)) + + uint64_t(_mm_extract_epi32(value, 3)); } -}; -// Signed bytes -template <> struct simd8 : 
base8_numeric { - simdutf_really_inline simd8() : base8_numeric() {} - simdutf_really_inline simd8(const __m128i _value) - : base8_numeric(_value) {} + simdutf_really_inline simd32 swap_bytes() const { + const __m128i shuffle = + _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); - // Splat constructor - simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} - // Array constructor - simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {} - // Member-by-member initialization - simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, - int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, - int8_t v12, int8_t v13, int8_t v14, int8_t v15) - : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, - v15}) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - simdutf_really_inline static simd8 - repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, - int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, - int8_t v12, int8_t v13, int8_t v14, int8_t v15) { - return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15); + return _mm_shuffle_epi8(value, shuffle); } - // Order-sensitive comparisons - simdutf_really_inline simd8 - max_val(const simd8 other) const { - return (__m128i)vec_max((__vector signed char)this->value, - (__vector signed char)(__m128i)other); - } - simdutf_really_inline simd8 - min_val(const simd8 other) const { - return (__m128i)vec_min((__vector signed char)this->value, - (__vector signed char)(__m128i)other); + template simdutf_really_inline simd32 shr() const { + return _mm_srli_epi32(value, N); } - simdutf_really_inline simd8 - operator>(const simd8 other) const { - return (__m128i)vec_cmpgt((__vector signed char)this->value, - (__vector signed char)(__m128i)other); + template simdutf_really_inline simd32 shl() const { + return _mm_slli_epi32(value, N); } - simdutf_really_inline simd8 - operator<(const simd8 other) const { - return (__m128i)vec_cmplt((__vector signed char)this->value, - (__vector signed char)(__m128i)other); + void dump() const { +#ifdef SIMDUTF_LOGGING + printf("[%08x, %08x, %08x, %08x]\n", uint32_t(_mm_extract_epi32(value, 0)), + uint32_t(_mm_extract_epi32(value, 1)), + uint32_t(_mm_extract_epi32(value, 2)), + uint32_t(_mm_extract_epi32(value, 3))); +#endif // SIMDUTF_LOGGING } -}; -// Unsigned bytes -template <> struct simd8 : base8_numeric { - simdutf_really_inline simd8() : base8_numeric() {} - simdutf_really_inline simd8(const __m128i _value) - : base8_numeric(_value) {} - // Splat constructor - simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} - // Array constructor - simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {} - // Member-by-member initialization - simdutf_really_inline - simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, - uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, - uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) - : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15}) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - simdutf_really_inline static simd8 - repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, - uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, - uint8_t v10, uint8_t v11, uint8_t v12,
uint8_t v13, uint8_t v14, - uint8_t v15) { - return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15); + // operators + simdutf_really_inline simd32 &operator+=(const simd32 other) { + value = _mm_add_epi32(value, other.value); + return *this; } - // Saturated math - simdutf_really_inline simd8 - saturating_add(const simd8 other) const { - return (__m128i)vec_adds(this->value, (__m128i)other); - } - simdutf_really_inline simd8 - saturating_sub(const simd8 other) const { - return (__m128i)vec_subs(this->value, (__m128i)other); + // static members + simdutf_really_inline static simd32 zero() { + return _mm_setzero_si128(); } - // Order-specific operations - simdutf_really_inline simd8 - max_val(const simd8 other) const { - return (__m128i)vec_max(this->value, (__m128i)other); - } - simdutf_really_inline simd8 - min_val(const simd8 other) const { - return (__m128i)vec_min(this->value, (__m128i)other); - } - // Same as >, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd8 - gt_bits(const simd8 other) const { - return this->saturating_sub(other); - } - // Same as <, but only guarantees true is nonzero (< guarantees true = -1) - simdutf_really_inline simd8 - lt_bits(const simd8 other) const { - return other.saturating_sub(*this); - } - simdutf_really_inline simd8 - operator<=(const simd8 other) const { - return other.max_val(*this) == other; - } - simdutf_really_inline simd8 - operator>=(const simd8 other) const { - return other.min_val(*this) == other; - } - simdutf_really_inline simd8 - operator>(const simd8 other) const { - return this->gt_bits(other).any_bits_set(); - } - simdutf_really_inline simd8 - operator<(const simd8 other) const { - return this->gt_bits(other).any_bits_set(); + simdutf_really_inline static simd32 splat(uint32_t v) { + return _mm_set1_epi32(v); } +}; - // Bit-specific operations - simdutf_really_inline simd8 bits_not_set() const { - return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0))); - } - simdutf_really_inline simd8 bits_not_set(simd8 bits) const { - return (*this & bits).bits_not_set(); - } - simdutf_really_inline simd8 any_bits_set() const { - return ~this->bits_not_set(); - } - simdutf_really_inline simd8 any_bits_set(simd8 bits) const { - return ~this->bits_not_set(bits); - } +//---------------------------------------------------------------------- - simdutf_really_inline bool is_ascii() const { - return this->saturating_sub(0b01111111u).bits_not_set_anywhere(); - } +template <> struct simd32 { + // static const size_t SIZE = sizeof(__m128i); + // static const size_t ELEMENTS = SIZE / sizeof(uint32_t); - simdutf_really_inline bool bits_not_set_anywhere() const { - return vec_all_eq(this->value, (__m128i)vec_splats(0)); - } - simdutf_really_inline bool any_bits_set_anywhere() const { - return !bits_not_set_anywhere(); - } - simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { - return vec_all_eq(vec_and(this->value, (__m128i)bits), - (__m128i)vec_splats(0)); - } - simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { - return !bits_not_set_anywhere(bits); - } - template simdutf_really_inline simd8 shr() const { - return simd8( - (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N))); + __m128i value; + + simdutf_really_inline simd32(const __m128i v) : value(v) {} + + simdutf_really_inline bool any() const { + return _mm_movemask_epi8(value) != 0; } - template simdutf_really_inline simd8 shl() const { - return simd8( - (__m128i)vec_sl(this->value, 
(__m128i)vec_splat_u8(N))); + + simdutf_really_inline uint8_t to_4bit_bitmask() const { + return uint8_t(_mm_movemask_ps(_mm_castsi128_ps(value))); } }; -template struct simd8x64 { - static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); - static_assert(NUM_CHUNKS == 4, - "PPC64 kernel should use four registers per 64-byte block."); - simd8 chunks[NUM_CHUNKS]; +//---------------------------------------------------------------------- - simd8x64(const simd8x64 &o) = delete; // no copy allowed - simd8x64 & - operator=(const simd8 other) = delete; // no assignment allowed - simd8x64() = delete; // no default constructor allowed +template +simdutf_really_inline simd32 operator|(const simd32 a, + const simd32 b) { + return _mm_or_si128(a.value, b.value); +} - simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, - const simd8 chunk2, const simd8 chunk3) - : chunks{chunk0, chunk1, chunk2, chunk3} {} +simdutf_really_inline simd32 min(const simd32 a, + const simd32 b) { + return _mm_min_epu32(a.value, b.value); +} - simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T)), simd8::load(ptr+2*sizeof(simd8)/sizeof(T)), simd8::load(ptr+3*sizeof(simd8)/sizeof(T))} {} +simdutf_really_inline simd32 max(const simd32 a, + const simd32 b) { + return _mm_max_epu32(a.value, b.value); +} - simdutf_really_inline void store(T* ptr) const { - this->chunks[0].store(ptr + sizeof(simd8) * 0/sizeof(T)); - this->chunks[1].store(ptr + sizeof(simd8) * 1/sizeof(T)); - this->chunks[2].store(ptr + sizeof(simd8) * 2/sizeof(T)); - this->chunks[3].store(ptr + sizeof(simd8) * 3/sizeof(T)); - } +simdutf_really_inline simd32 operator==(const simd32 a, + uint32_t b) { + return _mm_cmpeq_epi32(a.value, _mm_set1_epi32(b)); +} +simdutf_really_inline simd32 operator&(const simd32 a, + const simd32 b) { + return _mm_and_si128(a.value, b.value); +} - simdutf_really_inline simd8x64& operator |=(const simd8x64 &other) { - this->chunks[0] |= other.chunks[0]; - this->chunks[1] |= other.chunks[1]; - this->chunks[2] |= other.chunks[2]; - this->chunks[3] |= other.chunks[3]; - return *this; - } +simdutf_really_inline simd32 operator&(const simd32 a, + uint32_t b) { + return _mm_and_si128(a.value, _mm_set1_epi32(b)); +} - simdutf_really_inline simd8 reduce_or() const { - return (this->chunks[0] | this->chunks[1]) | - (this->chunks[2] | this->chunks[3]); - } +simdutf_really_inline simd32 operator|(const simd32 a, + uint32_t b) { + return _mm_or_si128(a.value, _mm_set1_epi32(b)); +} +simdutf_really_inline simd32 operator+(const simd32 a, + const simd32 b) { + return _mm_add_epi32(a.value, b.value); +} - simdutf_really_inline bool is_ascii() const { - return input.reduce_or().is_ascii(); - } +simdutf_really_inline simd32 operator-(const simd32 a, + uint32_t b) { + return _mm_sub_epi32(a.value, _mm_set1_epi32(b)); +} - simdutf_really_inline uint64_t to_bitmask() const { - uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); - uint64_t r1 = this->chunks[1].to_bitmask(); - uint64_t r2 = this->chunks[2].to_bitmask(); - uint64_t r3 = this->chunks[3].to_bitmask(); - return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); - } +simdutf_really_inline simd32 operator==(const simd32 a, + const simd32 b) { + return _mm_cmpeq_epi32(a.value, b.value); +} - simdutf_really_inline uint64_t eq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64(this->chunks[0] == mask, this->chunks[1] == mask, - this->chunks[2] == mask, this->chunks[3] == mask) - .to_bitmask(); 
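// Aside (illustrative sketch, not part of the vendored diff): scalar
// equivalent of simd32<uint32_t>::swap_bytes above. A 32-bit byte swap must
// reverse the four bytes inside each lane, which is why the
// _mm_shuffle_epi8 mask reads 3,2,1,0 / 7,6,5,4 / 11,10,9,8 / 15,14,13,12:
#include <cstdint>
static inline uint32_t bswap32(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) |
         (v << 24);
}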
+simdutf_really_inline simd32 operator>=(const simd32 a, + const simd32 b) { + return _mm_cmpeq_epi32(_mm_max_epu32(a.value, b.value), a.value); +} + +simdutf_really_inline simd32 operator!(const simd32 v) { + return _mm_xor_si128(v.value, _mm_set1_epi8(-1)); +} + +simdutf_really_inline simd32 operator>(const simd32 a, + const simd32 b) { + return !(b >= a); +} + +simdutf_really_inline simd32 select(const simd32 cond, + const simd32 v_true, + const simd32 v_false) { + return _mm_blendv_epi8(v_false.value, v_true.value, cond.value); +} +/* end file src/simdutf/westmere/simd32-inl.h */ +/* begin file src/simdutf/westmere/simd64-inl.h */ +template struct simd64; + +template <> struct simd64 { + // static const size_t SIZE = sizeof(__m128i); + // static const size_t ELEMENTS = SIZE / sizeof(uint64_t); + + __m128i value; + + simdutf_really_inline simd64(const __m128i v) : value(v) {} + + template + simdutf_really_inline simd64(const Pointer *ptr) + : value(_mm_loadu_si128(reinterpret_cast(ptr))) {} + + simdutf_really_inline uint64_t sum() const { + return _mm_extract_epi64(value, 0) + _mm_extract_epi64(value, 1); } - simdutf_really_inline uint64_t eq(const simd8x64 &other) const { - return simd8x64(this->chunks[0] == other.chunks[0], - this->chunks[1] == other.chunks[1], - this->chunks[2] == other.chunks[2], - this->chunks[3] == other.chunks[3]) - .to_bitmask(); + // operators + simdutf_really_inline simd64 &operator+=(const simd64 other) { + value = _mm_add_epi64(value, other.value); + return *this; } - simdutf_really_inline uint64_t lteq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64(this->chunks[0] <= mask, this->chunks[1] <= mask, - this->chunks[2] <= mask, this->chunks[3] <= mask) - .to_bitmask(); + // static members + simdutf_really_inline static simd64 zero() { + return _mm_setzero_si128(); } - simdutf_really_inline uint64_t in_range(const T low, const T high) const { - const simd8 mask_low = simd8::splat(low); - const simd8 mask_high = simd8::splat(high); - - return simd8x64( - (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), - (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), - (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), - (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { - const simd8 mask_low = simd8::splat(low); - const simd8 mask_high = simd8::splat(high); - return simd8x64( - (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), - (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), - (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), - (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low) - ).to_bitmask(); - } - simdutf_really_inline uint64_t lt(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64(this->chunks[0] < mask, this->chunks[1] < mask, - this->chunks[2] < mask, this->chunks[3] < mask) - .to_bitmask(); + simdutf_really_inline static simd64 splat(uint64_t v) { + return _mm_set1_epi64x(v); } +}; +/* end file src/simdutf/westmere/simd64-inl.h */ - simdutf_really_inline uint64_t gt(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] > mask, - this->chunks[1] > mask, - this->chunks[2] > mask, - this->chunks[3] > mask - ).to_bitmask(); - } - simdutf_really_inline uint64_t gteq(const T m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - this->chunks[0] >= mask, - 
this->chunks[1] >= mask, - this->chunks[2] >= mask, - this->chunks[3] >= mask - ).to_bitmask(); - } - simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { - const simd8 mask = simd8::splat(m); - return simd8x64( - simd8(this->chunks[0]) >= mask, - simd8(this->chunks[1]) >= mask, - simd8(this->chunks[2]) >= mask, - simd8(this->chunks[3]) >= mask - ).to_bitmask(); - } -}; // struct simd8x64 +simdutf_really_inline simd64 sum_8bytes(const simd8 v) { + return _mm_sad_epu8(v.value, simd8::zero()); +} + +simdutf_really_inline simd8 as_vector_u8(const simd32 v) { + return simd8(v.value); +} } // namespace simd } // unnamed namespace -} // namespace ppc64 +} // namespace westmere } // namespace simdutf -#endif // SIMDUTF_PPC64_SIMD_INPUT_H -/* end file src/simdutf/ppc64/simd.h */ +#endif // SIMDUTF_WESTMERE_SIMD_INPUT_H +/* end file src/simdutf/westmere/simd.h */ -/* begin file src/simdutf/ppc64/end.h */ -/* end file src/simdutf/ppc64/end.h */ +/* begin file src/simdutf/westmere/end.h */ +#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE +// nothing needed. +#else +SIMDUTF_UNTARGET_REGION +#endif -#endif // SIMDUTF_IMPLEMENTATION_PPC64 +#undef SIMDUTF_SIMD_HAS_BYTEMASK +/* end file src/simdutf/westmere/end.h */ -#endif // SIMDUTF_PPC64_H -/* end file src/simdutf/ppc64.h */ -/* begin file src/simdutf/rvv.h */ -#ifndef SIMDUTF_RVV_H -#define SIMDUTF_RVV_H +#endif // SIMDUTF_IMPLEMENTATION_WESTMERE +#endif // SIMDUTF_WESTMERE_COMMON_H +/* end file src/simdutf/westmere.h */ +/* begin file src/simdutf/ppc64.h */ +#ifndef SIMDUTF_PPC64_H +#define SIMDUTF_PPC64_H #ifdef SIMDUTF_FALLBACK_H -#error "rvv.h must be included before fallback.h" + #error "ppc64.h must be included before fallback.h" #endif -#define SIMDUTF_CAN_ALWAYS_RUN_RVV SIMDUTF_IS_RVV - -#ifndef SIMDUTF_IMPLEMENTATION_RVV -#define SIMDUTF_IMPLEMENTATION_RVV (SIMDUTF_CAN_ALWAYS_RUN_RVV || (SIMDUTF_IS_RISCV64 && SIMDUTF_HAS_RVV_INTRINSICS && SIMDUTF_HAS_RVV_TARGET_REGION)) +#ifndef SIMDUTF_IMPLEMENTATION_PPC64 + #define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64) #endif +#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 \ + SIMDUTF_IMPLEMENTATION_PPC64 &&SIMDUTF_IS_PPC64 -#if SIMDUTF_IMPLEMENTATION_RVV -#if SIMDUTF_CAN_ALWAYS_RUN_RVV -#define SIMDUTF_TARGET_RVV -#else -#define SIMDUTF_TARGET_RVV SIMDUTF_TARGET_REGION("arch=+v") -#endif -#if !SIMDUTF_IS_ZVBB && SIMDUTF_HAS_ZVBB_INTRINSICS -#define SIMDUTF_TARGET_ZVBB SIMDUTF_TARGET_REGION("arch=+v,+zvbb") -#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 namespace simdutf { -namespace rvv { -} // namespace rvv +/** + * Implementation for ALTIVEC (PPC64). 
+ */ +namespace ppc64 {} // namespace ppc64 } // namespace simdutf -/* begin file src/simdutf/rvv/implementation.h */ -#ifndef SIMDUTF_RVV_IMPLEMENTATION_H -#define SIMDUTF_RVV_IMPLEMENTATION_H +/* begin file src/simdutf/ppc64/implementation.h */ +#ifndef SIMDUTF_PPC64_IMPLEMENTATION_H +#define SIMDUTF_PPC64_IMPLEMENTATION_H namespace simdutf { -namespace rvv { +namespace ppc64 { namespace { using namespace simdutf; + +template simdutf_really_inline size_t align_down(size_t size) { + return N * (size / N); +} } // namespace class implementation final : public simdutf::implementation { public: simdutf_really_inline implementation() - : simdutf::implementation("rvv", "RISC-V Vector Extension", - internal::instruction_set::RVV) - , _supports_zvbb(internal::detect_supported_architectures() & internal::instruction_set::ZVBB) - {} - simdutf_warn_unused int detect_encodings(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf8(const char *buf, size_t len, char *utf8_output) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16le(const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16be(const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf32(const char *buf, size_t len, char32_t *utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_latin1(const char *buf, size_t len, char *latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char *buf, size_t len, char *latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char *buf, size_t len, char *latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16le(const char *buf, size_t len, char16_t *utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16be(const char *buf, size_t len, char16_t *utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char *buf, size_t len, char16_t *utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char *buf, size_t len, char16_t *utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; - simdutf_warn_unused 
size_t convert_valid_utf8_to_utf16be(const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf32(const char *buf, size_t len, char32_t *utf32_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char *buf, size_t len, char32_t *utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t *buf, size_t len, char *latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, char *latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, char *latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const 
char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final; - void change_endianness_utf16(const char16_t *buf, size_t len, char16_t *output) const noexcept final; - simdutf_warn_unused size_t count_utf16le(const char16_t *buf, size_t len) const noexcept; - simdutf_warn_unused size_t count_utf16be(const char16_t *buf, size_t len) const noexcept; - simdutf_warn_unused size_t count_utf8(const char *buf, size_t len) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *buf, size_t len) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *buf, size_t len) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *buf, size_t len) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *buf, size_t len) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf8(const char *buf, size_t len) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *buf, size_t len) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *buf, size_t len) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf8(const char *buf, size_t len) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf, size_t len) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) const noexcept; - simdutf_warn_unused size_t utf32_length_from_latin1(size_t len) const noexcept; - simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept; - simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, size_t len) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options) const noexcept; - size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept; -private: - const bool _supports_zvbb; - -#if SIMDUTF_IS_ZVBB - bool 
supports_zvbb() const { return true; } -#elif SIMDUTF_HAS_ZVBB_INTRINSICS - bool supports_zvbb() const { return _supports_zvbb; } -#else - bool supports_zvbb() const { return false; } + : simdutf::implementation("ppc64", "PPC64 ALTIVEC", + internal::instruction_set::ALTIVEC) {} + +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool validate_ascii(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept final; + + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + 
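// Aside (usage sketch, not part of the vendored diff): the per-kernel
// members declared here mirror simdutf's public free functions; a typical
// caller sizes the output with the matching *_length_from_* helper before
// converting. Note convert_utf8_to_latin1 reports failure by returning 0:
#include "simdutf.h"
#include <vector>
size_t utf8_to_latin1_demo(const char *utf8, size_t len,
                           std::vector<char> &out) {
  out.resize(simdutf::latin1_length_from_utf8(utf8, len));
  return simdutf::convert_utf8_to_latin1(utf8, len, out.data());
}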
simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, 
size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *buf, size_t length, + char16_t *output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t *buf, + size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t *buf, + size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused 
size_t + utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + utf16_length_from_utf8(const char *input, size_t length) const noexcept; + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept; + ; + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept; + ; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf32_length_from_utf8(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + latin1_length_from_utf8(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + utf8_length_from_latin1(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept; + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused result + base64_to_binary(const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept; + + size_t binary_to_base64_with_lines(const char *input, size_t length, + char *output, size_t line_length, + base64_options options) const noexcept; + const char *find(const char *start, const char *end, + char 
character) const noexcept; + + const char16_t *find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept; +#endif // SIMDUTF_FEATURE_BASE64 + +#ifdef SIMDUTF_INTERNAL_TESTS + virtual std::vector internal_tests() const override; #endif +#if SIMDUTF_FEATURE_UTF16 + + void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept final; + void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 }; -} // namespace rvv +} // namespace ppc64 } // namespace simdutf -#endif // SIMDUTF_RVV_IMPLEMENTATION_H -/* end file src/simdutf/rvv/implementation.h */ -/* begin file src/simdutf/rvv/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "rvv" -// #define SIMDUTF_IMPLEMENTATION rvv +#endif // SIMDUTF_PPC64_IMPLEMENTATION_H +/* end file src/simdutf/ppc64/implementation.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_RVV -// nothing needed. -#else -SIMDUTF_TARGET_RVV -#endif -/* end file src/simdutf/rvv/begin.h */ -/* begin file src/simdutf/rvv/intrinsics.h */ -#ifndef SIMDUTF_RVV_INTRINSICS_H -#define SIMDUTF_RVV_INTRINSICS_H +/* begin file src/simdutf/ppc64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "ppc64" +// #define SIMDUTF_IMPLEMENTATION ppc64 +/* end file src/simdutf/ppc64/begin.h */ + // Declarations +/* begin file src/simdutf/ppc64/intrinsics.h */ +#ifndef SIMDUTF_PPC64_INTRINSICS_H +#define SIMDUTF_PPC64_INTRINSICS_H -#include -#if __riscv_v_intrinsic >= 1000000 || __GCC__ >= 14 -#define simdutf_vrgather_u8m1x2(tbl, idx) __riscv_vcreate_v_u8m1_u8m2( \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), __riscv_vsetvlmax_e8m1()), \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1), __riscv_vsetvlmax_e8m1())); +// This should be the correct header whether +// you use visual studio or other compilers. +#include -#define simdutf_vrgather_u8m1x4(tbl, idx) __riscv_vcreate_v_u8m1_u8m4( \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 0), __riscv_vsetvlmax_e8m1()), \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 1), __riscv_vsetvlmax_e8m1()), \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2), __riscv_vsetvlmax_e8m1()), \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3), __riscv_vsetvlmax_e8m1())); -#else -// This has worse codegen on gcc -#define simdutf_vrgather_u8m1x2(tbl, idx) \ - __riscv_vset_v_u8m1_u8m2(__riscv_vlmul_ext_v_u8m1_u8m2( \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), __riscv_vsetvlmax_e8m1())), 1, \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1), __riscv_vsetvlmax_e8m1())) - -#define simdutf_vrgather_u8m1x4(tbl, idx) \ - __riscv_vset_v_u8m1_u8m4(__riscv_vset_v_u8m1_u8m4(\ - __riscv_vset_v_u8m1_u8m4(__riscv_vlmul_ext_v_u8m1_u8m4( \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 0), __riscv_vsetvlmax_e8m1())), 1, \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 1), __riscv_vsetvlmax_e8m1())), 2, \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2), __riscv_vsetvlmax_e8m1())), 3, \ - __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3), __riscv_vsetvlmax_e8m1())) +// These are defined by altivec.h in GCC toolchain, it is safe to undef them. +#ifdef bool + #undef bool #endif -/* Zvbb adds dedicated support for endianness swaps with vrev8, but if we can't - * use that, we have to emulate it with the standard V extension. 
- * Using LMUL=1 vrgathers could be faster than the srl+macc variant, but that - * would increase register pressure, and vrgather implementations performance - * varies a lot. */ -enum class simdutf_ByteFlip { NONE, V, ZVBB }; +#ifdef vector + #undef vector +#endif -template -simdutf_really_inline static uint16_t simdutf_byteflip(uint16_t v) { - if (method != simdutf_ByteFlip::NONE) - return (uint16_t)((v*1u) << 8 | (v*1u) >> 8); - return v; -} +#endif // SIMDUTF_PPC64_INTRINSICS_H +/* end file src/simdutf/ppc64/intrinsics.h */ +/* begin file src/simdutf/ppc64/bitmanipulation.h */ +#ifndef SIMDUTF_PPC64_BITMANIPULATION_H +#define SIMDUTF_PPC64_BITMANIPULATION_H -#ifdef SIMDUTF_TARGET_ZVBB -SIMDUTF_UNTARGET_REGION -SIMDUTF_TARGET_ZVBB -#endif +namespace simdutf { +namespace ppc64 { +namespace { -template -simdutf_really_inline static vuint16m1_t simdutf_byteflip(vuint16m1_t v, size_t vl) { -#if SIMDUTF_HAS_ZVBB_INTRINSICS - if (method == simdutf_ByteFlip::ZVBB) - return __riscv_vrev8_v_u16m1(v, vl); -#endif - if (method == simdutf_ByteFlip::V) - return __riscv_vmacc_vx_u16m1(__riscv_vsrl_vx_u16m1(v, 8, vl), 0x100, v, vl); - return v; +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +simdutf_really_inline int count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num); // Visual Studio wants two underscores } - -template -simdutf_really_inline static vuint16m2_t simdutf_byteflip(vuint16m2_t v, size_t vl) { -#if SIMDUTF_HAS_ZVBB_INTRINSICS - if (method == simdutf_ByteFlip::ZVBB) - return __riscv_vrev8_v_u16m2(v, vl); -#endif - if (method == simdutf_ByteFlip::V) - return __riscv_vmacc_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 8, vl), 0x100, v, vl); - return v; +#else +simdutf_really_inline int count_ones(uint64_t input_num) { + return __builtin_popcountll(input_num); } - -template -simdutf_really_inline static vuint16m4_t simdutf_byteflip(vuint16m4_t v, size_t vl) { -#if SIMDUTF_HAS_ZVBB_INTRINSICS - if (method == simdutf_ByteFlip::ZVBB) - return __riscv_vrev8_v_u16m4(v, vl); #endif - if (method == simdutf_ByteFlip::V) - return __riscv_vmacc_vx_u16m4(__riscv_vsrl_vx_u16m4(v, 8, vl), 0x100, v, vl); - return v; -} -template -simdutf_really_inline static vuint16m8_t simdutf_byteflip(vuint16m8_t v, size_t vl) { -#if SIMDUTF_HAS_ZVBB_INTRINSICS - if (method == simdutf_ByteFlip::ZVBB) - return __riscv_vrev8_v_u16m8(v, vl); -#endif - if (method == simdutf_ByteFlip::V) - return __riscv_vmacc_vx_u16m8(__riscv_vsrl_vx_u16m8(v, 8, vl), 0x100, v, vl); - return v; +#if SIMDUTF_NEED_TRAILING_ZEROES +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { + return __builtin_ctzll(input_num); } - -#ifdef SIMDUTF_TARGET_ZVBB -SIMDUTF_UNTARGET_REGION -SIMDUTF_TARGET_RVV #endif -#endif // SIMDUTF_RVV_INTRINSICS_H -/* end file src/simdutf/rvv/intrinsics.h */ -/* begin file src/simdutf/rvv/end.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_RVV -// nothing needed. 
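// Aside (illustrative sketch, not part of the vendored diff): the plain-V
// fallback above synthesizes vrev8 as vmacc(vsrl(v, 8), 0x100, v), i.e.
// (v >> 8) + (v << 8) in 16-bit arithmetic; the two terms occupy disjoint
// bytes, so the add equals the usual (v >> 8) | (v << 8) byte swap:
#include <cstdint>
static inline uint16_t byteflip16(uint16_t v) {
  return (uint16_t)((v >> 8) + (uint16_t)(v << 8));
}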
-#else -SIMDUTF_UNTARGET_REGION -#endif +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf -/* end file src/simdutf/rvv/end.h */ +#endif // SIMDUTF_PPC64_BITMANIPULATION_H +/* end file src/simdutf/ppc64/bitmanipulation.h */ +/* begin file src/simdutf/ppc64/simd.h */ +#ifndef SIMDUTF_PPC64_SIMD_H +#define SIMDUTF_PPC64_SIMD_H -#endif // SIMDUTF_IMPLEMENTATION_RVV +#include -#endif // SIMDUTF_RVV_H -/* end file src/simdutf/rvv.h */ -/* begin file src/simdutf/fallback.h */ -#ifndef SIMDUTF_FALLBACK_H -#define SIMDUTF_FALLBACK_H +namespace simdutf { +namespace ppc64 { +namespace { +namespace simd { +using vec_bool_t = __vector __bool char; +using vec_bool16_t = __vector __bool short; +using vec_bool32_t = __vector __bool int; +using vec_u8_t = __vector unsigned char; +using vec_i8_t = __vector signed char; +using vec_u16_t = __vector unsigned short; +using vec_i16_t = __vector signed short; +using vec_u32_t = __vector unsigned int; +using vec_i32_t = __vector signed int; +using vec_u64_t = __vector unsigned long long; +using vec_i64_t = __vector signed long long; + +// clang-format off +template struct vector_u8_type_for_element_aux { + using type = typename std::conditional::value, vec_bool_t, + typename std::conditional::value, vec_u8_t, + typename std::conditional::value, vec_i8_t, void>::type>::type>::type; + + static_assert(not std::is_same::value, + "accepted element types are 8 bit integers or bool"); +}; -// Note that fallback.h is always imported last. +template struct vector_u16_type_for_element_aux { + using type = typename std::conditional::value, vec_bool16_t, + typename std::conditional::value, vec_u16_t, + typename std::conditional::value, vec_i16_t, void>::type>::type>::type; -// Default Fallback to on unless a builtin implementation has already been selected. -#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK -#if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE || SIMDUTF_CAN_ALWAYS_RUN_PPC64 || SIMDUTF_CAN_ALWAYS_RUN_RVV -#define SIMDUTF_IMPLEMENTATION_FALLBACK 0 -#else -#define SIMDUTF_IMPLEMENTATION_FALLBACK 1 -#endif -#endif + static_assert(not std::is_same::value, + "accepted element types are 16 bit integers or bool"); +}; -#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK) +template struct vector_u32_type_for_element_aux { + using type = typename std::conditional::value, vec_bool32_t, + typename std::conditional::value, vec_u32_t, + typename std::conditional::value, vec_i32_t, void>::type>::type>::type; -#if SIMDUTF_IMPLEMENTATION_FALLBACK + static_assert(not std::is_same::value, + "accepted element types are 32 bit integers or bool"); +}; +// clang-format on -namespace simdutf { -/** - * Fallback implementation (runs on any machine). 
- */ -namespace fallback { -} // namespace fallback -} // namespace simdutf +template +using vector_u8_type_for_element = + typename vector_u8_type_for_element_aux::type; -/* begin file src/simdutf/fallback/implementation.h */ -#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H -#define SIMDUTF_FALLBACK_IMPLEMENTATION_H +template +using vector_u16_type_for_element = + typename vector_u16_type_for_element_aux::type; +template +using vector_u32_type_for_element = + typename vector_u32_type_for_element_aux::type; -namespace simdutf { -namespace fallback { +template uint16_t move_mask_u8(T vec) { + const vec_u8_t perm_mask = {15 * 8, 14 * 8, 13 * 8, 12 * 8, 11 * 8, 10 * 8, + 9 * 8, 8 * 8, 7 * 8, 6 * 8, 5 * 8, 4 * 8, + 3 * 8, 2 * 8, 1 * 8, 0 * 8}; -namespace { -using namespace simdutf; + const auto result = (vec_u64_t)vec_vbpermq((vec_u8_t)vec, perm_mask); +#if SIMDUTF_IS_BIG_ENDIAN + return static_cast(result[0]); +#else + return static_cast(result[1]); +#endif } -class implementation final : public simdutf::implementation { -public: - simdutf_really_inline implementation() : simdutf::implementation( - "fallback", - "Generic fallback implementation", - 0 - ) {} - simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; - simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; - simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t 
len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const 
noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; - void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; - simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; - simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept; - simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept; - simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept; - simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept; - simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * 
input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept; - simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options) const noexcept; - size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept; -}; -} // namespace fallback -} // namespace simdutf +/* begin file src/simdutf/ppc64/simd8-inl.h */ +// file included directly -#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H -/* end file src/simdutf/fallback/implementation.h */ +template struct base8 { + using vector_type = vector_u8_type_for_element; + vector_type value; + static const int SIZE = sizeof(vector_type); + static const int ELEMENTS = sizeof(vector_type) / sizeof(T); -/* begin file src/simdutf/fallback/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "fallback" -// #define SIMDUTF_IMPLEMENTATION fallback -/* end file src/simdutf/fallback/begin.h */ + // Zero constructor + simdutf_really_inline base8() : value{vec_splats(T(0))} {} -// Declarations -/* begin file src/simdutf/fallback/bitmanipulation.h */ -#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H -#define SIMDUTF_FALLBACK_BITMANIPULATION_H + // Conversion from SIMD register + simdutf_really_inline base8(const vector_type _value) : value{_value} {} -#include + // Splat scalar + simdutf_really_inline base8(T v) : value{vec_splats(v)} {} -namespace simdutf { -namespace fallback { -namespace { - -} // unnamed namespace -} // namespace fallback -} // namespace simdutf - -#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H -/* end file src/simdutf/fallback/bitmanipulation.h */ - -/* begin file src/simdutf/fallback/end.h */ -/* end file src/simdutf/fallback/end.h */ - -#endif // SIMDUTF_IMPLEMENTATION_FALLBACK -#endif // SIMDUTF_FALLBACK_H -/* end file src/simdutf/fallback.h */ - -/* begin file src/scalar/utf8.h */ -#ifndef SIMDUTF_UTF8_H -#define SIMDUTF_UTF8_H - -namespace simdutf { -namespace scalar { -namespace { -namespace utf8 { -#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV -// only used by the fallback kernel. -// credit: based on code from Google Fuchsia (Apache Licensed) -inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { - const uint8_t *data = reinterpret_cast(buf); - uint64_t pos = 0; - uint32_t code_point = 0; - while (pos < len) { - // check of the next 16 bytes are ascii. 
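// Note on the fast path below: two unaligned 8-byte loads are OR-ed
// together, and if no byte of the OR has its top bit (0x80) set, all 16
// bytes are ASCII, so the position can safely advance by 16 at once.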
- uint64_t next_pos = pos + 16; - if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - std::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - pos = next_pos; - continue; - } - } - unsigned char byte = data[pos]; - - while (byte < 0b10000000) { - if (++pos == len) { return true; } - byte = data[pos]; - } - - if ((byte & 0b11100000) == 0b11000000) { - next_pos = pos + 2; - if (next_pos > len) { return false; } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } - // range check - code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if ((code_point < 0x80) || (0x7ff < code_point)) { return false; } - } else if ((byte & 0b11110000) == 0b11100000) { - next_pos = pos + 3; - if (next_pos > len) { return false; } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; } - // range check - code_point = (byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if ((code_point < 0x800) || (0xffff < code_point) || - (0xd7ff < code_point && code_point < 0xe000)) { - return false; - } - } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 - next_pos = pos + 4; - if (next_pos > len) { return false; } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; } - // range check - code_point = - (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff || 0x10ffff < code_point) { return false; } - } else { - // we may have a continuation - return false; - } - pos = next_pos; + // Conversion to SIMD register + simdutf_really_inline operator const vector_type &() const { + return this->value; } - return true; -} -#endif -inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - uint32_t code_point = 0; - while (pos < len) { - // check of the next 16 bytes are ascii. 
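// Note: unlike validate(), the _with_errors variant below reports an
// error_code together with the byte offset of the offending sequence:
// TOO_SHORT for truncated input, OVERLONG for overlong encodings,
// SURROGATE for encoded surrogates, TOO_LARGE for values past U+10FFFF,
// and TOO_LONG / HEADER_BITS for stray continuation or invalid lead bytes.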
- size_t next_pos = pos + 16; - if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - std::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - pos = next_pos; - continue; - } - } - unsigned char byte = data[pos]; - - while (byte < 0b10000000) { - if (++pos == len) { return result(error_code::SUCCESS, len); } - byte = data[pos]; - } - - if ((byte & 0b11100000) == 0b11000000) { - next_pos = pos + 2; - if (next_pos > len) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - // range check - code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if ((code_point < 0x80) || (0x7ff < code_point)) { return result(error_code::OVERLONG, pos); } - } else if ((byte & 0b11110000) == 0b11100000) { - next_pos = pos + 3; - if (next_pos > len) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - // range check - code_point = (byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);} - if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); } - } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 - next_pos = pos + 4; - if (next_pos > len) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - // range check - code_point = - (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); } - if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); } - } else { - // we either have too many continuation bytes or an invalid leading byte - if ((byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); } - else { return result(error_code::HEADER_BITS, pos); } - } - pos = next_pos; + template simdutf_really_inline void store(U *ptr) const { + vec_xst(value, 0, reinterpret_cast(ptr)); } - return result(error_code::SUCCESS, len); -} -// Finds the previous leading byte starting backward from buf and validates with errors from there -// Used to pinpoint the location of an error when an invalid chunk is detected -// We assume that the stream starts with a leading byte, and to check that it is the case, we -// ask that you pass a pointer to the start of the stream (start). 
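// Note: a "leading byte" here is any byte b with (b & 0b11000000) !=
// 0b10000000; since a UTF-8 sequence spans at most 4 bytes, the rewind
// below never needs to step back over more than 3 continuation bytes.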
-inline simdutf_warn_unused result rewind_and_validate_with_errors(const char *start, const char *buf, size_t len) noexcept { - // First check that we start with a leading byte - if ((*start & 0b11000000) == 0b10000000) { - return result(error_code::TOO_LONG, 0); - } - size_t extra_len{0}; - // A leading byte cannot be further than 4 bytes away - for(int i = 0; i < 5; i++) { - unsigned char byte = *buf; - if ((byte & 0b11000000) != 0b10000000) { - break; - } else { - buf--; - extra_len++; - } + template void operator|=(const SIMD8 other) { + this->value = vec_or(this->value, other.value); } - result res = validate_with_errors(buf, len + extra_len); - res.count -= extra_len; - return res; -} - -inline size_t count_code_points(const char* buf, size_t len) { - const int8_t * p = reinterpret_cast(buf); - size_t counter{0}; - for(size_t i = 0; i < len; i++) { - // -65 is 0b10111111, anything larger in two-complement's should start a new code point. - if(p[i] > -65) { counter++; } - } - return counter; -} - -inline size_t utf16_length_from_utf8(const char* buf, size_t len) { - const int8_t * p = reinterpret_cast(buf); - size_t counter{0}; - for(size_t i = 0; i < len; i++) { - if(p[i] > -65) { counter++; } - if(uint8_t(p[i]) >= 240) { counter++; } - } - return counter; -} + template vector_type prev_aux(vector_type prev_chunk) const { + vector_type chunk = this->value; +#if !SIMDUTF_IS_BIG_ENDIAN + chunk = (vector_type)vec_reve(this->value); + prev_chunk = (vector_type)vec_reve((vector_type)prev_chunk); +#endif + chunk = (vector_type)vec_sld((vector_type)prev_chunk, (vector_type)chunk, + 16 - N); +#if !SIMDUTF_IS_BIG_ENDIAN + chunk = (vector_type)vec_reve((vector_type)chunk); +#endif + return chunk; + } -simdutf_warn_unused inline size_t trim_partial_utf8(const char *input, size_t length) { - if (length < 3) { - switch (length) { - case 2: - if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 2 bytes left - return length; - case 1: - if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left - return length; - case 0: - return length; - } + simdutf_really_inline bool is_ascii() const { + return move_mask_u8(this->value) == 0; } - if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 1 byte left - if (uint8_t(input[length-3]) >= 0xf0) { return length-3; } // 4-byte characters with only 3 bytes left - return length; -} -} // utf8 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + simdutf_really_inline uint16_t to_bitmask() const { + return move_mask_u8(value); + } -#endif -/* end file src/scalar/utf8.h */ -/* begin file src/scalar/utf16.h */ -#ifndef SIMDUTF_UTF16_H -#define SIMDUTF_UTF16_H + template + simdutf_really_inline void store_bytes_as_utf16(char16_t *p) const { + const vector_type zero = vec_splats(T(0)); -namespace simdutf { -namespace scalar { -namespace { -namespace utf16 { + if (big_endian) { + const vec_u8_t perm_lo = {16, 0, 16, 1, 16, 2, 16, 3, + 16, 4, 16, 5, 16, 6, 16, 7}; + const vec_u8_t perm_hi = {16, 8, 16, 9, 16, 10, 16, 11, + 16, 12, 16, 13, 16, 14, 16, 15}; -inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) { - return uint16_t((word >> 8) | (word << 8)); -} + 
const vector_type v0 = vec_perm(value, zero, perm_lo); + const vector_type v1 = vec_perm(value, zero, perm_hi); -template -inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexcept { - const uint16_t *data = reinterpret_cast(buf); - uint64_t pos = 0; - while (pos < len) { - uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos]; - if((word &0xF800) == 0xD800) { - if(pos + 1 >= len) { return false; } - uint16_t diff = uint16_t(word - 0xD800); - if(diff > 0x3FF) { return false; } - uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if(diff2 > 0x3FF) { return false; } - pos += 2; +#if defined(__clang__) + vec_xst(v0, 0, reinterpret_cast(p)); + vec_xst(v1, 16, reinterpret_cast(p)); +#else + vec_xst(v0, 0, reinterpret_cast(p)); + vec_xst(v1, 16, reinterpret_cast(p)); +#endif // defined(__clang__) } else { - pos++; - } - } - return true; -} + const vec_u8_t perm_lo = {0, 16, 1, 16, 2, 16, 3, 16, + 4, 16, 5, 16, 6, 16, 7, 16}; + const vec_u8_t perm_hi = {8, 16, 9, 16, 10, 16, 11, 16, + 12, 16, 13, 16, 14, 16, 15, 16}; -template -inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, size_t len) noexcept { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - while (pos < len) { - uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos]; - if((word & 0xF800) == 0xD800) { - if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } - uint16_t diff = uint16_t(word - 0xD800); - if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } - uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } - pos += 2; - } else { - pos++; + const vector_type v0 = vec_perm(value, zero, perm_lo); + const vector_type v1 = vec_perm(value, zero, perm_hi); + +#if defined(__clang__) + vec_xst(v0, 0, reinterpret_cast(p)); + vec_xst(v1, 16, reinterpret_cast(p)); +#else + vec_xst(v0, 0, reinterpret_cast(p)); + vec_xst(v1, 16, reinterpret_cast(p)); +#endif // defined(__clang__) } } - return result(error_code::SUCCESS, pos); -} -template -inline size_t count_code_points(const char16_t* buf, size_t len) { - // We are not BOM aware. - const uint16_t * p = reinterpret_cast(buf); - size_t counter{0}; - for(size_t i = 0; i < len; i++) { - uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; - counter += ((word & 0xFC00) != 0xDC00); + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const { + store_bytes_as_utf16(p); } - return counter; -} -template -inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) { - // We are not BOM aware. - const uint16_t * p = reinterpret_cast(buf); - size_t counter{0}; - for(size_t i = 0; i < len; i++) { - uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; - counter++; // ASCII - counter += static_cast(word > 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes - counter += static_cast((word > 0x7FF && word <= 0xD7FF) || (word >= 0xE000)); // three-byte - } - return counter; -} + simdutf_really_inline void store_bytes_as_utf32(char32_t *p) const { + const vector_type zero = vec_splats(T(0)); -template -inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len) { - // We are not BOM aware. 
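// Note on the loop below: it adds one for every 16-bit unit that is not a
// low surrogate (0xDC00-0xDFFF), so a surrogate pair is counted once,
// giving exactly one UTF-32 code unit per code point.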
- const uint16_t * p = reinterpret_cast(buf); - size_t counter{0}; - for(size_t i = 0; i < len; i++) { - uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; - counter += ((word & 0xFC00) != 0xDC00); - } - return counter; -} +#if SIMDUTF_IS_BIG_ENDIAN + const vec_u8_t perm0 = {16, 16, 16, 0, 16, 16, 16, 1, + 16, 16, 16, 2, 16, 16, 16, 3}; + const vec_u8_t perm1 = {16, 16, 16, 4, 16, 16, 16, 5, + 16, 16, 16, 6, 16, 16, 16, 7}; -inline size_t latin1_length_from_utf16(size_t len) { - return len; -} + const vec_u8_t perm2 = {16, 16, 16, 8, 16, 16, 16, 9, + 16, 16, 16, 10, 16, 16, 16, 11}; -simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out) { - const uint16_t * input = reinterpret_cast(in); - uint16_t * output = reinterpret_cast(out); - for (size_t i = 0; i < size; i++) { - *output++ = uint16_t(input[i] >> 8 | input[i] << 8); - } -} + const vec_u8_t perm3 = {16, 16, 16, 12, 16, 16, 16, 13, + 16, 16, 16, 14, 16, 16, 16, 15}; +#else + const vec_u8_t perm0 = {0, 16, 16, 16, 1, 16, 16, 16, + 2, 16, 16, 16, 3, 16, 16, 16}; + const vec_u8_t perm1 = {4, 16, 16, 16, 5, 16, 16, 16, + 6, 16, 16, 16, 7, 16, 16, 16}; -template -simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t* input, size_t length) { - if (length <= 1) { - return length; - } - uint16_t last_word = uint16_t(input[length-1]); - last_word = !match_system(big_endian) ? swap_bytes(last_word) : last_word; - length -= ((last_word & 0xFC00) == 0xD800); - return length; -} + const vec_u8_t perm2 = {8, 16, 16, 16, 9, 16, 16, 16, + 10, 16, 16, 16, 11, 16, 16, 16}; -} // utf16 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + const vec_u8_t perm3 = {12, 16, 16, 16, 13, 16, 16, 16, + 14, 16, 16, 16, 15, 16, 16, 16}; +#endif // SIMDUTF_IS_BIG_ENDIAN -#endif -/* end file src/scalar/utf16.h */ -/* begin file src/scalar/utf32.h */ -#ifndef SIMDUTF_UTF32_H -#define SIMDUTF_UTF32_H + const vector_type v0 = vec_perm(value, zero, perm0); + const vector_type v1 = vec_perm(value, zero, perm1); + const vector_type v2 = vec_perm(value, zero, perm2); + const vector_type v3 = vec_perm(value, zero, perm3); -namespace simdutf { -namespace scalar { -namespace { -namespace utf32 { + constexpr size_t n = base8::SIZE; -inline simdutf_warn_unused bool validate(const char32_t *buf, size_t len) noexcept { - const uint32_t *data = reinterpret_cast(buf); - uint64_t pos = 0; - for(;pos < len; pos++) { - uint32_t word = data[pos]; - if(word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) { - return false; - } +#if defined(__clang__) + vec_xst(v0, 0 * n, reinterpret_cast(p)); + vec_xst(v1, 1 * n, reinterpret_cast(p)); + vec_xst(v2, 2 * n, reinterpret_cast(p)); + vec_xst(v3, 3 * n, reinterpret_cast(p)); +#else + vec_xst(v0, 0 * n, reinterpret_cast(p)); + vec_xst(v1, 1 * n, reinterpret_cast(p)); + vec_xst(v2, 2 * n, reinterpret_cast(p)); + vec_xst(v3, 3 * n, reinterpret_cast(p)); +#endif // defined(__clang__) } - return true; -} -inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, size_t len) noexcept { - const uint32_t *data = reinterpret_cast(buf); - size_t pos = 0; - for(;pos < len; pos++) { - uint32_t word = data[pos]; - if(word > 0x10FFFF) { - return result(error_code::TOO_LARGE, pos); - } - if(word >= 0xD800 && word <= 0xDFFF) { - return result(error_code::SURROGATE, pos); - } - } - return result(error_code::SUCCESS, pos); -} + simdutf_really_inline void store_words_as_utf32(char32_t *p) const { + const vector_type zero = 
vec_splats(T(0)); + +#if SIMDUTF_IS_BIG_ENDIAN + const vec_u8_t perm0 = {16, 16, 0, 1, 16, 16, 2, 3, + 16, 16, 4, 5, 16, 16, 6, 7}; + const vec_u8_t perm1 = {16, 16, 8, 9, 16, 16, 10, 11, + 16, 16, 12, 13, 16, 16, 14, 15}; +#else + const vec_u8_t perm0 = {0, 1, 16, 16, 2, 3, 16, 16, + 4, 5, 16, 16, 6, 7, 16, 16}; + const vec_u8_t perm1 = {8, 9, 16, 16, 10, 11, 16, 16, + 12, 13, 16, 16, 14, 15, 16, 16}; +#endif // SIMDUTF_IS_BIG_ENDIAN + + const vector_type v0 = vec_perm(value, zero, perm0); + const vector_type v1 = vec_perm(value, zero, perm1); -inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len) { - // We are not BOM aware. - const uint32_t * p = reinterpret_cast(buf); - size_t counter{0}; - for(size_t i = 0; i < len; i++) { - // credit: @ttsugriy for the vectorizable approach - counter++; // ASCII - counter += static_cast(p[i] > 0x7F); // two-byte - counter += static_cast(p[i] > 0x7FF); // three-byte - counter += static_cast(p[i] > 0xFFFF); // four-bytes + constexpr size_t n = base8::SIZE; + +#if defined(__clang__) + vec_xst(v0, 0 * n, reinterpret_cast(p)); + vec_xst(v1, 1 * n, reinterpret_cast(p)); +#else + vec_xst(v0, 0 * n, reinterpret_cast(p)); + vec_xst(v1, 1 * n, reinterpret_cast(p)); +#endif // defined(__clang__) } - return counter; -} -inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) { - // We are not BOM aware. - const uint32_t * p = reinterpret_cast(buf); - size_t counter{0}; - for(size_t i = 0; i < len; i++) { - counter++; // non-surrogate word - counter += static_cast(p[i] > 0xFFFF); // surrogate pair + simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const { + store_bytes_as_utf32(p); } - return counter; -} +}; -inline size_t latin1_length_from_utf32(size_t len) { - // We are not BOM aware. - return len; // a utf32 codepoint will always represent 1 latin1 character -} +// Forward declaration +template struct simd8; -inline simdutf_warn_unused uint32_t swap_bytes(const uint32_t word) { - return ((word >> 24) & 0xff) | // move byte 3 to byte 0 - ((word << 8) & 0xff0000) | // move byte 1 to byte 2 - ((word >> 8) & 0xff00) | // move byte 2 to byte 1 - ((word << 24) & 0xff000000); // byte 0 to byte 3 -} +template +simd8 operator==(const simd8 a, const simd8 b); -} // utf32 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +template +simd8 operator!=(const simd8 a, const simd8 b); -#endif -/* end file src/scalar/utf32.h */ -/* begin file src/scalar/base64.h */ -#ifndef SIMDUTF_BASE64_H -#define SIMDUTF_BASE64_H +template simd8 operator&(const simd8 a, const simd8 b); -#include -#include -#include -namespace simdutf { -namespace scalar { -namespace { -namespace base64 { +template simd8 operator|(const simd8 a, const simd8 b); -// This function is not expected to be fast. Do not use in long loops. -template -bool is_ascii_white_space(char_type c) { - return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f'; -} +template simd8 operator^(const simd8 a, const simd8 b); -// Returns true upon success. The destination buffer must be large enough. -// This functions assumes that the padding (=) has been removed. -template -result base64_tail_decode(char *dst, const char_type *src, size_t length, base64_options options) { - // This looks like 5 branches, but we expect the compiler to resolve this to a single branch: - const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value; - const uint32_t *d0 = (options & base64_url) ? 
tables::base64::base64_url::d0 : tables::base64::base64_default::d0; - const uint32_t *d1 = (options & base64_url) ? tables::base64::base64_url::d1 : tables::base64::base64_default::d1; - const uint32_t *d2 = (options & base64_url) ? tables::base64::base64_url::d2 : tables::base64::base64_default::d2; - const uint32_t *d3 = (options & base64_url) ? tables::base64::base64_url::d3 : tables::base64::base64_default::d3; +template simd8 operator+(const simd8 a, const simd8 b); - const char_type *srcend = src + length; - const char_type *srcinit = src; - const char *dstinit = dst; +template simd8 operator<(const simd8 a, const simd8 b); - uint32_t x; - size_t idx; - uint8_t buffer[4]; - while (true) { - while (src + 4 <= srcend && - (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] | - d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) { - if(match_system(endianness::BIG)) { - x = scalar::utf32::swap_bytes(x); - } - std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes - dst += 3; - src += 4; - } - idx = 0; - // we need at least four characters. - while (idx < 4 && src < srcend) { - char_type c = *src; - uint8_t code = to_base64[uint8_t(c)]; - buffer[idx] = uint8_t(code); - if (code <= 63) { - idx++; - } else if (code > 64) { - return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; - } else { - // We have a space or a newline. We ignore it. - } - src++; - } - if (idx != 4) { - if (idx == 2) { - uint32_t triple = - (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6); - if(match_system(endianness::BIG)) { - triple <<= 8; - std::memcpy(dst, &triple, 1); - } else { - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 1); - } - dst += 1; - - } else if (idx == 3) { - uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) + - (uint32_t(buffer[1]) << 2 * 6) + - (uint32_t(buffer[2]) << 1 * 6); - if(match_system(endianness::BIG)) { - triple <<= 8; - std::memcpy(dst, &triple, 2); - } else { - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 2); - } - dst += 2; - } else if (idx == 1) { - return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)}; - } - return {SUCCESS, size_t(dst - dstinit)}; - } +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base8 { + using super = base8; - uint32_t triple = - (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) + - (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6); - if(match_system(endianness::BIG)) { - triple <<= 8; - std::memcpy(dst, &triple, 3); - } else { - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 3); - } - dst += 3; + static simdutf_really_inline simd8 splat(bool _value) { + return (vector_type)vec_splats((unsigned char)(-(!!_value))); } -} -// like base64_tail_decode, but it will not write past the end of the output buffer. -// outlen is modified to reflect the number of bytes written. -// This functions assumes that the padding (=) has been removed. -template -result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length, base64_options options) { - // This looks like 5 branches, but we expect the compiler to resolve this to a single branch: - const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value; - const uint32_t *d0 = (options & base64_url) ? tables::base64::base64_url::d0 : tables::base64::base64_default::d0; - const uint32_t *d1 = (options & base64_url) ? 
tables::base64::base64_url::d1 : tables::base64::base64_default::d1; - const uint32_t *d2 = (options & base64_url) ? tables::base64::base64_url::d2 : tables::base64::base64_default::d2; - const uint32_t *d3 = (options & base64_url) ? tables::base64::base64_url::d3 : tables::base64::base64_default::d3; - - const char_type *srcend = src + length; - const char_type *srcinit = src; - const char *dstinit = dst; - const char *dstend = dst + outlen; + simdutf_really_inline simd8() : super(vector_type()) {} + simdutf_really_inline simd8(const vector_type _value) : super(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} - uint32_t x; - size_t idx; - uint8_t buffer[4]; - while (true) { - while (src + 4 <= srcend && - (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] | - d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) { - if(match_system(endianness::BIG)) { - x = scalar::utf32::swap_bytes(x); - } - if(dst + 3 > dstend) { - outlen = size_t(dst - dstinit); - return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)}; - } - std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes - dst += 3; - src += 4; - } - idx = 0; - const char_type *srccur = src; - - // we need at least four characters. - while (idx < 4 && src < srcend) { - char_type c = *src; - uint8_t code = to_base64[uint8_t(c)]; - buffer[idx] = uint8_t(code); - if (code <= 63) { - idx++; - } else if (code > 64) { - outlen = size_t(dst - dstinit); - return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; - } else { - // We have a space or a newline. We ignore it. - } - src++; - } - if (idx != 4) { - if (idx == 2) { - if(dst == dstend) { - outlen = size_t(dst - dstinit); - return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)}; - } - uint32_t triple = - (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6); - if(match_system(endianness::BIG)) { - triple <<= 8; - std::memcpy(dst, &triple, 1); - } else { - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 1); - } - dst += 1; + template + simdutf_really_inline simd8(simd8 other) + : simd8(vector_type(other.value)) {} - } else if (idx == 3) { - if(dst + 2 >= dstend) { - outlen = size_t(dst - dstinit); - return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)}; - } - uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) + - (uint32_t(buffer[1]) << 2 * 6) + - (uint32_t(buffer[2]) << 1 * 6); - if(match_system(endianness::BIG)) { - triple <<= 8; - std::memcpy(dst, &triple, 2); - } else { - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 2); - } - dst += 2; - } else if (idx == 1) { - outlen = size_t(dst - dstinit); - return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)}; - } - outlen = size_t(dst - dstinit); - return {SUCCESS, size_t(dst - dstinit)}; - } - if(dst + 3 >= dstend) { - outlen = size_t(dst - dstinit); - return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)}; - } - uint32_t triple = - (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) + - (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6); - if(match_system(endianness::BIG)) { - triple <<= 8; - std::memcpy(dst, &triple, 3); - } else { - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 3); - } - dst += 3; + simdutf_really_inline uint16_t to_bitmask() const { + return move_mask_u8(value); } -} -// Returns the number of bytes written. The destination buffer must be large -// enough. It will add padding (=) if needed. 
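// Note: in the encoder below, each 3-byte group maps to 4 output characters
// through the e0/e1/e2 tables; a trailing 1- or 2-byte remainder yields 2
// or 3 characters plus '=' padding, except in base64_url mode, which omits
// the padding.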
-size_t tail_encode_base64(char *dst, const char *src, size_t srclen, base64_options options) { - // This looks like 3 branches, but we expect the compiler to resolve this to a single branch: - const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0 : tables::base64::base64_default::e0; - const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1 : tables::base64::base64_default::e1; - const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2 : tables::base64::base64_default::e2; - char *out = dst; - size_t i = 0; - uint8_t t1, t2, t3; - for (; i + 2 < srclen; i += 3) { - t1 = uint8_t(src[i]); - t2 = uint8_t(src[i + 1]); - t3 = uint8_t(src[i + 2]); - *out++ = e0[t1]; - *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; - *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; - *out++ = e2[t3]; + simdutf_really_inline bool any() const { + return !vec_all_eq(this->value, (vector_type)vec_splats(0)); } - switch (srclen - i) { - case 0: - break; - case 1: - t1 = uint8_t(src[i]); - *out++ = e0[t1]; - *out++ = e1[(t1 & 0x03) << 4]; - if((options & base64_url) == 0) { - *out++ = '='; - *out++ = '='; - } - break; - default: /* case 2 */ - t1 = uint8_t(src[i]); - t2 = uint8_t(src[i + 1]); - *out++ = e0[t1]; - *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; - *out++ = e2[(t2 & 0x0F) << 2]; - if((options & base64_url) == 0) { - *out++ = '='; - } + + simdutf_really_inline bool all() const { return to_bitmask() == 0xffff; } + + simdutf_really_inline simd8 operator~() const { + return this->value ^ (vector_type)splat(true); } - return (size_t)(out - dst); -} +}; -template -simdutf_warn_unused size_t maximal_binary_length_from_base64(const char_type * input, size_t length) noexcept { - // We follow https://infra.spec.whatwg.org/#forgiving-base64-decode - size_t padding = 0; - if(length > 0) { - if(input[length - 1] == '=') { - padding++; - if(length > 1 && input[length - 2] == '=') { - padding++; - } - } +template struct base8_numeric : base8 { + using super = base8; + using vector_type = typename super::vector_type; + + static simdutf_really_inline simd8 splat(T value) { + return (vector_type)vec_splats(value); } - size_t actual_length = length - padding; - if(actual_length % 4 <= 1) { - return actual_length / 4 * 3; + + static simdutf_really_inline simd8 zero() { return splat(0); } + + template + static simdutf_really_inline simd8 load(const U *values) { + return vec_xl(0, reinterpret_cast(values)); } - // if we have a valid input, then the remainder must be 2 or 3 adding one or two extra bytes. - return actual_length / 4 * 3 + (actual_length %4) - 1; -} -simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options) noexcept { - if(options & base64_url) { - return length/3 * 4 + ((length % 3) ? (length % 3) + 1 : 0); + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, + T v5, T v6, T v7, T v8, T v9, + T v10, T v11, T v12, T v13, + T v14, T v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15); } - return (length + 2)/3 * 4; // We use padding to make the length a multiple of 4. 
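// Worked example of the formula above: for length = 5, padded mode gives
// (5 + 2) / 3 * 4 = 8 output bytes (two 4-character groups, the second one
// ending in a single '=' pad), while URL mode gives 5/3*4 + (5%3)+1 = 7.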
-} -} // namespace base64 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const vector_type _value) + : base8(_value) {} -#endif -/* end file src/scalar/base64.h */ + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } -namespace simdutf { -bool implementation::supported_by_runtime_system() const { - uint32_t required_instruction_sets = this->required_instruction_sets(); - uint32_t supported_instruction_sets = internal::detect_supported_architectures(); - return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets); -} - -simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept { - // If there is a BOM, then we trust it. - auto bom_encoding = simdutf::BOM::check_bom(input, length); - if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - // UTF8 is common, it includes ASCII, and is commonly represented - // without a BOM, so if it fits, go with that. Note that it is still - // possible to get it wrong, we are only 'guessing'. If some has UTF-16 - // data without a BOM, it could pass as UTF-8. - // - // An interesting twist might be to check for UTF-16 ASCII first (every - // other byte is zero). - if(validate_utf8(input, length)) { return encoding_type::UTF8; } - // The next most common encoding that might appear without BOM is probably - // UTF-16LE, so try that next. - if((length % 2) == 0) { - // important: we need to divide by two - if(validate_utf16le(reinterpret_cast(input), length/2)) { return encoding_type::UTF16_LE; } - } - if((length % 4) == 0) { - if(validate_utf32(reinterpret_cast(input), length/4)) { return encoding_type::UTF32_LE; } - } - return encoding_type::unspecified; -} + simdutf_really_inline simd8 &operator-=(const simd8 other) { + this->value = vec_sub(this->value, other.value); + return *static_cast *>(this); + } -namespace internal { -// When there is a single implementation, we should not pay a price - // for dispatching to the best implementation. We should just use the - // one we have. This is a compile-time check. - #define SIMDUTF_SINGLE_IMPLEMENTATION (SIMDUTF_IMPLEMENTATION_ICELAKE \ - + SIMDUTF_IMPLEMENTATION_HASWELL + SIMDUTF_IMPLEMENTATION_WESTMERE \ - + SIMDUTF_IMPLEMENTATION_ARM64 + SIMDUTF_IMPLEMENTATION_PPC64 \ - + SIMDUTF_IMPLEMENTATION_FALLBACK == 1) + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return (vector_type)vec_perm((vector_type)lookup_table, + (vector_type)lookup_table, this->value); + } -// Static array of known implementations. We're hoping these get baked into the executable -// without requiring a static initializer. 
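// Note: the per-kernel get_*_singleton() helpers that follow use
// function-local statics, which C++11 guarantees are initialized exactly
// once, on first use and thread-safely, sidestepping global constructor
// ordering issues.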
+ template + simdutf_really_inline simd8 + lookup_32(const simd8 lookup_table_lo, + const simd8 lookup_table_hi) const { + return (vector_type)vec_perm(lookup_table_lo.value, lookup_table_hi.value, + this->value); + } + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } +}; -#if SIMDUTF_IMPLEMENTATION_ICELAKE -static const icelake::implementation* get_icelake_singleton() { - static const icelake::implementation icelake_singleton{}; - return &icelake_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_HASWELL -static const haswell::implementation* get_haswell_singleton() { - static const haswell::implementation haswell_singleton{}; - return &haswell_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_WESTMERE -static const westmere::implementation* get_westmere_singleton() { - static const westmere::implementation westmere_singleton{}; - return &westmere_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_ARM64 -static const arm64::implementation* get_arm64_singleton() { - static const arm64::implementation arm64_singleton{}; - return &arm64_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_PPC64 -static const ppc64::implementation* get_ppc64_singleton() { - static const ppc64::implementation ppc64_singleton{}; - return &ppc64_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_RVV -static const rvv::implementation* get_rvv_singleton() { - static const rvv::implementation rvv_singleton{}; - return &rvv_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_FALLBACK -static const fallback::implementation* get_fallback_singleton() { - static const fallback::implementation fallback_singleton{}; - return &fallback_singleton; -} -#endif - -#if SIMDUTF_SINGLE_IMPLEMENTATION -static const implementation* get_single_implementation() { - return -#if SIMDUTF_IMPLEMENTATION_ICELAKE - get_icelake_singleton(); -#endif -#if SIMDUTF_IMPLEMENTATION_HASWELL - get_haswell_singleton(); -#endif -#if SIMDUTF_IMPLEMENTATION_WESTMERE - get_westmere_singleton(); -#endif -#if SIMDUTF_IMPLEMENTATION_ARM64 - get_arm64_singleton(); -#endif -#if SIMDUTF_IMPLEMENTATION_PPC64 - get_ppc64_singleton(); -#endif -#if SIMDUTF_IMPLEMENTATION_FALLBACK - get_fallback_singleton(); -#endif -} -#endif - -/** - * @private Detects best supported implementation on first use, and sets it - */ -class detect_best_supported_implementation_on_first_use final : public implementation { -public: - std::string name() const noexcept final { return set_best()->name(); } - std::string description() const noexcept final { return set_best()->description(); } - uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); } +// Unsigned bytes +template <> struct simd8 : base8_numeric { + using Self = simd8; - simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept override { - return set_best()->detect_encodings(input, length); + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const vector_type _value) + : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // 
Array constructor + simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8((vector_type){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, + uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, + uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); } - simdutf_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override { - return set_best()->validate_utf8(buf, len); + simdutf_really_inline bool is_ascii() const { + return move_mask_u8(this->value) == 0; } - simdutf_warn_unused result validate_utf8_with_errors(const char * buf, size_t len) const noexcept final override { - return set_best()->validate_utf8_with_errors(buf, len); - } + template + simdutf_really_inline simd8(simd8 other) + : simd8(vector_type(other.value)) {} - simdutf_warn_unused bool validate_ascii(const char * buf, size_t len) const noexcept final override { - return set_best()->validate_ascii(buf, len); + template + simdutf_really_inline Self prev(const Self prev_chunk) const { + return prev_aux(prev_chunk.value); } - simdutf_warn_unused result validate_ascii_with_errors(const char * buf, size_t len) const noexcept final override { - return set_best()->validate_ascii_with_errors(buf, len); + // Saturated math + simdutf_really_inline simd8 + saturating_sub(const simd8 other) const { + return (vector_type)vec_subs(this->value, (vector_type)other); } - simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) const noexcept final override { - return set_best()->validate_utf16le(buf, len); + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + gt_bits(const simd8 other) const { + return this->saturating_sub(other); } - simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) const noexcept final override { - return set_best()->validate_utf16be(buf, len); + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + lt_bits(const simd8 other) const { + return other.saturating_sub(*this); } - simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) const noexcept final override { - return set_best()->validate_utf16le_with_errors(buf, len); + // Bit-specific operations + simdutf_really_inline bool bits_not_set_anywhere() const { + return vec_all_eq(this->value, (vector_type)vec_splats(0)); } - simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) const noexcept final override { - return set_best()->validate_utf16be_with_errors(buf, len); + simdutf_really_inline bool any_bits_set_anywhere() const { + return !bits_not_set_anywhere(); } - simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) const noexcept final override { - return set_best()->validate_utf32(buf, len); + template simdutf_really_inline simd8 shr() const { + return simd8( + 
(vector_type)vec_sr(this->value, (vector_type)vec_splat_u8(N))); } - simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) const noexcept final override { - return set_best()->validate_utf32_with_errors(buf, len); + template simdutf_really_inline simd8 shl() const { + return simd8( + (vector_type)vec_sl(this->value, (vector_type)vec_splat_u8(N))); + } + void dump() const { +#ifdef SIMDUTF_LOGGING + uint8_t tmp[16]; + store(tmp); + for (int i = 0; i < 16; i++) { + if (i == 0) { + printf("[%02x", tmp[i]); + } else if (i == 15) { + printf(" %02x]", tmp[i]); + } else { + printf(" %02x", tmp[i]); + } + } + putchar('\n'); +#endif // SIMDUTF_LOGGING } - simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final override { - return set_best()->convert_latin1_to_utf8(buf, len,utf8_output); + void dump_ascii() const { +#ifdef SIMDUTF_LOGGING + uint8_t tmp[16]; + store(tmp); + for (int i = 0; i < 16; i++) { + if (i == 0) { + printf("[%c", tmp[i]); + } else if (i == 15) { + printf("%c]", tmp[i]); + } else { + printf("%c", tmp[i]); + } + } + putchar('\n'); +#endif // SIMDUTF_LOGGING } +}; - simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output); - } +// Signed bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const vector_type _value) + : base8_numeric(_value) {} - simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output); - } + template + simdutf_really_inline simd8(simd8 other) + : simd8(vector_type(other.value)) {} - simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t * latin1_output) const noexcept final override { - return set_best()->convert_latin1_to_utf32(buf, len,latin1_output); - } + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {} - simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_utf8_to_latin1(buf, len,latin1_output); - } + simdutf_really_inline operator simd8() const; - simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output); + // Saturated math + simdutf_really_inline simd8 + saturating_add(const simd8 other) const { + return (vector_type)vec_adds(this->value, other.value); + } + + void dump() const { + int8_t tmp[16]; + store(tmp); + for (int i = 0; i < 16; i++) { + if (i == 0) { + printf("[%02x", tmp[i]); + } else if (i == 15) { + printf("%02x]", tmp[i]); + } else { + printf("%02x", tmp[i]); + } + } + putchar('\n'); } +}; - simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_valid_utf8_to_latin1(buf, len,latin1_output); - } +template +simd8 operator==(const simd8 a, const simd8 b) { + return vec_cmpeq(a.value, b.value); +} - 
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output); - } +template +simd8 operator!=(const simd8 a, const simd8 b) { + return vec_cmpne(a.value, b.value); +} - simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output); - } +template simd8 operator&(const simd8 a, const simd8 b) { + return vec_and(a.value, b.value); +} - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output); - } +template simd8 operator&(const simd8 a, U b) { + return vec_and(a.value, vec_splats(T(b))); +} - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output); - } +template simd8 operator|(const simd8 a, const simd8 b) { + return vec_or(a.value, b.value); +} - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output); - } +template simd8 operator^(const simd8 a, const simd8 b) { + return vec_xor(a.value, b.value); +} - simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output); - } +template simd8 operator^(const simd8 a, U b) { + return vec_xor(a.value, vec_splats(T(b))); +} - simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override { - return set_best()->convert_utf8_to_utf32(buf, len, utf32_output); - } +template simd8 operator+(const simd8 a, const simd8 b) { + return vec_add(a.value, b.value); +} - simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override { - return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output); - } +template simd8 operator+(const simd8 a, U b) { + return vec_add(a.value, vec_splats(T(b))); +} - simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override { - return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output); - } +simdutf_really_inline simd8::operator simd8() const { + return (simd8::vector_type)value; +} - simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output); - } +template +simd8 operator<(const simd8 a, const simd8 b) { + return vec_cmplt(a.value, b.value); +} - simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output); - } +template +simd8 operator>(const simd8 a, const simd8 b) { + return vec_cmpgt(a.value, b.value); +} - simdutf_warn_unused result 
convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_output); - } +template +simd8 operator>=(const simd8 a, const simd8 b) { + return vec_cmpge(a.value, b.value); +} - simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_output); - } +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static constexpr size_t ELEMENTS = simd8::ELEMENTS; - simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output); - } + static_assert(NUM_CHUNKS == 4, + "PPC64 kernel should use four registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; - simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output); - } + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + simd8x64(simd8x64 &&) = default; - simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { - return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output); - } + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, + const simd8 chunk2, const simd8 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T *ptr) + : chunks{simd8::load(ptr), + simd8::load(ptr + sizeof(simd8) / sizeof(T)), + simd8::load(ptr + 2 * sizeof(simd8) / sizeof(T)), + simd8::load(ptr + 3 * sizeof(simd8) / sizeof(T))} {} - simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { - return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output); + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + ELEMENTS * 0); + this->chunks[1].store(ptr + ELEMENTS * 1); + this->chunks[2].store(ptr + ELEMENTS * 2); + this->chunks[3].store(ptr + ELEMENTS * 3); } - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { - return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output); + simdutf_really_inline simd8x64 &operator|=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + this->chunks[2] |= other.chunks[2]; + this->chunks[3] |= other.chunks[3]; + return *this; } - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { - return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output); + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); } - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { - return 
set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output); + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); } - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { - return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output); + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 0); + this->chunks[1].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 1); + this->chunks[2].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 2); + this->chunks[3].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 3); } - simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_utf32_to_latin1(buf, len,latin1_output); + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8) * 0); + this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8) * 1); + this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8) * 2); + this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8) * 3); } - simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_utf32_to_latin1_with_errors(buf, len,latin1_output); + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); } - simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override { - return set_best()->convert_utf32_to_latin1(buf, len,latin1_output); + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] < mask, this->chunks[1] < mask, + this->chunks[2] < mask, this->chunks[3] < mask) + .to_bitmask(); } - simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override { - return set_best()->convert_utf32_to_utf8(buf, len, utf8_output); + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] > mask, this->chunks[1] > mask, + this->chunks[2] > mask, this->chunks[3] > mask) + .to_bitmask(); } - - simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override { - return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] == mask, this->chunks[1] == mask, + this->chunks[2] == mask, this->chunks[3] == mask) + .to_bitmask(); } - - simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override { - return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output); + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(simd8(this->chunks[0]) >= 
mask, + simd8(this->chunks[1]) >= mask, + simd8(this->chunks[2]) >= mask, + simd8(this->chunks[3]) >= mask) + .to_bitmask(); } - simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output); + void dump() const { + puts(""); + for (int i = 0; i < 4; i++) { + printf("chunk[%d] = ", i); + this->chunks[i].dump(); + } } +}; // struct simd8x64 - simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output); - } +simdutf_really_inline simd8 avg(const simd8 a, + const simd8 b) { + return vec_avg(a.value, b.value); +} +/* end file src/simdutf/ppc64/simd8-inl.h */ +/* begin file src/simdutf/ppc64/simd16-inl.h */ +// file included directly - simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output); - } +template struct simd16; - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output); - } +template struct base16 { + using vector_type = vector_u16_type_for_element; + static const int SIZE = sizeof(vector_type); + static const int ELEMENTS = sizeof(vector_type) / sizeof(T); - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output); - } + vector_type value; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { - return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output); - } + // Zero constructor + simdutf_really_inline base16() : value{vector_type()} {} - simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { - return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output); + // Conversion from SIMD register + simdutf_really_inline base16(const vector_type _value) : value{_value} {} + void dump() const { +#ifdef SIMDUTF_LOGGING + uint16_t tmp[8]; + vec_xst(value, 0, reinterpret_cast(tmp)); + for (int i = 0; i < 8; i++) { + if (i == 0) { + printf("[%04x", tmp[i]); + } else if (i == 8 - 1) { + printf(" %04x]", tmp[i]); + } else { + printf(" %04x", tmp[i]); + } + } + putchar('\n'); +#endif // SIMDUTF_LOGGING } +}; - simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { - return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output); - } +// Forward declaration +template struct simd16; - simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { - return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output); - } +template +simd16 operator==(const simd16 a, const simd16 b); - simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* 
utf32_output) const noexcept final override { - return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output); - } +template +simd16 operator==(const simd16 a, U b); - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { - return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output); - } +template simd16 operator&(const simd16 a, const simd16 b); - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { - return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output); - } +template simd16 operator|(const simd16 a, const simd16 b); - void change_endianness_utf16(const char16_t * buf, size_t len, char16_t * output) const noexcept final override { - set_best()->change_endianness_utf16(buf, len, output); - } +template simd16 operator|(const simd16 a, U b); - simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t len) const noexcept final override { - return set_best()->count_utf16le(buf, len); - } +template simd16 operator^(const simd16 a, U b); - simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t len) const noexcept final override { - return set_best()->count_utf16be(buf, len); +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd16 : base16 { + static simdutf_really_inline simd16 splat(bool _value) { + return (vector_type)vec_splats(uint16_t(-(!!_value))); } - simdutf_warn_unused size_t count_utf8(const char * buf, size_t len) const noexcept final override { - return set_best()->count_utf8(buf, len); - } + simdutf_really_inline simd16() : base16() {} - simdutf_warn_unused size_t latin1_length_from_utf8(const char * buf, size_t len) const noexcept override { - return set_best()->latin1_length_from_utf8(buf, len); - } + simdutf_really_inline simd16(const vector_type _value) + : base16(_value) {} - simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) const noexcept override { - return set_best()->latin1_length_from_utf16(len); - } + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} - simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) const noexcept override { - return set_best()->latin1_length_from_utf32(len); + simdutf_really_inline uint16_t to_bitmask() const { + return move_mask_u8(value); } - simdutf_warn_unused size_t utf8_length_from_latin1(const char * buf, size_t len) const noexcept override { - return set_best()->utf8_length_from_latin1(buf, len); - } + simdutf_really_inline bool any() const { + const auto tmp = vec_u64_t(value); - simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override { - return set_best()->utf8_length_from_utf16le(buf, len); + return tmp[0] || tmp[1]; // Note: logical or, not binary one } - simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override { - return set_best()->utf8_length_from_utf16be(buf, len); - } + simdutf_really_inline bool is_zero() const { + const auto tmp = vec_u64_t(value); - simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept override { - return set_best()->utf16_length_from_latin1(len); + return (tmp[0] | tmp[1]) == 0; } - simdutf_warn_unused size_t utf32_length_from_latin1(size_t len) const noexcept override { - return set_best()->utf32_length_from_latin1(len); 
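The any() and is_zero() helpers above skip the movemask entirely: the 128-bit
comparison mask is reinterpreted as two 64-bit lanes, and a single OR over the
two halves answers "is any lane set?" (or, negated, "are all lanes clear?").
A minimal scalar sketch of the same idea, assuming the mask has already been
stored to memory; the function names are illustrative, not part of the
vendored sources:

#include <cstdint>
#include <cstring>

// Scalar analogue of simd16<bool>::any() / is_zero(): view a 16-byte
// all-ones/all-zeros comparison mask as two 64-bit words, then OR the
// words; any set bit means some lane compared true.
static bool mask_any(const uint8_t mask[16]) {
  uint64_t lo, hi;
  std::memcpy(&lo, mask, 8);
  std::memcpy(&hi, mask + 8, 8);
  return (lo | hi) != 0;
}

static bool mask_is_zero(const uint8_t mask[16]) { return !mask_any(mask); }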
+ simdutf_really_inline simd16 &operator|=(const simd16 rhs) { + value = vec_or(this->value, rhs.value); + return *this; } +}; - simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override { - return set_best()->utf32_length_from_utf16le(buf, len); - } +template struct base16_numeric : base16 { + using vector_type = typename base16::vector_type; - simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override { - return set_best()->utf32_length_from_utf16be(buf, len); + static simdutf_really_inline simd16 splat(T _value) { + return vec_splats(_value); } - simdutf_warn_unused size_t utf16_length_from_utf8(const char * buf, size_t len) const noexcept override { - return set_best()->utf16_length_from_utf8(buf, len); - } + static simdutf_really_inline simd16 zero() { return splat(0); } - simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * buf, size_t len) const noexcept override { - return set_best()->utf8_length_from_utf32(buf, len); + template + static simdutf_really_inline simd16 load(const U *ptr) { + return vec_xl(0, reinterpret_cast(ptr)); } - simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * buf, size_t len) const noexcept override { - return set_best()->utf16_length_from_utf32(buf, len); - } + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const vector_type _value) + : base16(_value) {} - simdutf_warn_unused size_t utf32_length_from_utf8(const char * buf, size_t len) const noexcept override { - return set_best()->utf32_length_from_utf8(buf, len); + // Store to array + template simdutf_really_inline void store(U *dst) const { +#if defined(__clang__) + return vec_xst(this->value, 0, reinterpret_cast(dst)); +#else + return vec_xst(this->value, 0, reinterpret_cast(dst)); +#endif // defined(__clang__) } - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept override { - return set_best()->maximal_binary_length_from_base64(input, length); + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { + return vec_xor(this->value, vec_splats(T(0xffff))); } +}; - simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept override { - return set_best()->base64_to_binary(input, length, output, options); - } +// Signed code units +template <> struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const vector_type _value) + : base16_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline operator simd16() const; +}; - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept override { - return set_best()->maximal_binary_length_from_base64(input, length); - } +// Unsigned code units +template <> struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const vector_type _value) + : base16_numeric(_value) {} - simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept override { - return set_best()->base64_to_binary(input, length, output, options); - } + // Splat constructor + simdutf_really_inline 
simd16(uint16_t _value) : simd16(splat(_value)) {} - simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options) const noexcept override { - return set_best()->base64_length_from_binary(length, options); - } + // Array constructor + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast(values))) {} - size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept override { - return set_best()->binary_to_base64(input, length, output, options); + simdutf_really_inline bool is_ascii() const { + return vec_all_lt(value, vec_splats(uint16_t(128))); } - simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} - -private: - const implementation *set_best() const noexcept; -}; - -static_assert(std::is_trivially_destructible::value, "detect_best_supported_implementation_on_first_use should be trivially destructible"); - -static const std::initializer_list& get_available_implementation_pointers() { - static const std::initializer_list available_implementation_pointers { -#if SIMDUTF_IMPLEMENTATION_ICELAKE - get_icelake_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_HASWELL - get_haswell_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_WESTMERE - get_westmere_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_ARM64 - get_arm64_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_PPC64 - get_ppc64_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_RVV - get_rvv_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_FALLBACK - get_fallback_singleton(), -#endif - }; // available_implementation_pointers - return available_implementation_pointers; -} - -// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support -class unsupported_implementation final : public implementation { -public: - simdutf_warn_unused int detect_encodings(const char *, size_t) const noexcept override { - return encoding_type::unspecified; + // Order-specific operations + simdutf_really_inline simd16 + max_val(const simd16 other) const { + return vec_max(this->value, other.value); } - - simdutf_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override { - return false; // Just refuse to validate. Given that we have a fallback implementation - // it seems unlikely that unsupported_implementation will ever be used. If it is used, - // then it will flag all strings as invalid. The alternative is to return an error_code - // from which the user has to figure out whether the string is valid UTF-8... which seems - // like a lot of work just to handle the very unlikely case that we have an unsupported - // implementation. And, when it does happen (that we have an unsupported implementation), - // what are the chances that the programmer has a fallback? Given that *we* provide the - // fallback, it implies that the programmer would need a fallback for our fallback. 
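The rationale comment deleted above makes the dispatch contract explicit: the
kernels in get_available_implementation_pointers() are listed in priority
order, and a build normally includes the portable fallback, so the
unsupported implementation is effectively unreachable. A short usage sketch
of the public dispatch API (ordinary simdutf client code, not part of the
vendored file): it lists the compiled-in kernels and reports which one the
runtime detector selected; setting the SIMDUTF_FORCE_IMPLEMENTATION
environment variable before the first call overrides that choice.

#include <cstdio>
#include "simdutf.h"

int main() {
  // Kernels compiled into this build, in the priority order that
  // detect_best_supported() walks.
  for (const simdutf::implementation *impl :
       simdutf::get_available_implementations()) {
    std::printf("%s: %s\n", impl->name().c_str(),
                impl->description().c_str());
  }
  // Resolved on first use, unless SIMDUTF_FORCE_IMPLEMENTATION is set.
  std::printf("active: %s\n",
              simdutf::get_active_implementation()->name().c_str());
  return 0;
}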
+ simdutf_really_inline simd16 + min_val(const simd16 other) const { + return vec_min(this->value, other.value); } - - simdutf_warn_unused result validate_utf8_with_errors(const char *, size_t) const noexcept final override { - return result(error_code::OTHER, 0); + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 + operator<=(const simd16 other) const { + return other.max_val(*this) == other; } - simdutf_warn_unused bool validate_ascii(const char *, size_t) const noexcept final override { - return false; + simdutf_really_inline simd16 + operator>=(const simd16 other) const { + return other.min_val(*this) == other; } - simdutf_warn_unused result validate_ascii_with_errors(const char *, size_t) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline simd16 + operator<(const simd16 other) const { + return vec_cmplt(value, other.value); } - simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override { - return false; + // Bit-specific operations + template simdutf_really_inline simd16 shr() const { + return vec_sr(value, vec_splats(uint16_t(N))); } - simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override { - return false; + template simdutf_really_inline simd16 shl() const { + return vec_sl(value, vec_splats(uint16_t(N))); } - simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override { - return result(error_code::OTHER, 0); + // Change the endianness + simdutf_really_inline simd16 swap_bytes() const { + return vec_revb(value); } - simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override { - return result(error_code::OTHER, 0); + // Pack with the unsigned saturation of two uint16_t code units into single + // uint8_t vector + static simdutf_really_inline simd8 pack(const simd16 &v0, + const simd16 &v1) { + return vec_packs(v0.value, v1.value); } +}; - simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override { - return false; - } +template +simd16 operator==(const simd16 a, const simd16 b) { + return vec_cmpeq(a.value, b.value); +} - simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override { - return result(error_code::OTHER, 0); - } +template +simd16 operator==(const simd16 a, U b) { + return vec_cmpeq(a.value, vec_splats(T(b))); +} - simdutf_warn_unused size_t convert_latin1_to_utf8(const char*, size_t, char*) const noexcept final override { - return 0; - } +template +simd16 operator&(const simd16 a, const simd16 b) { + return vec_and(a.value, b.value); +} - simdutf_warn_unused size_t convert_latin1_to_utf16le(const char*, size_t, char16_t*) const noexcept final override { - return 0; - } +template simd16 operator&(const simd16 a, U b) { + return vec_and(a.value, vec_splats(T(b))); +} - simdutf_warn_unused size_t convert_latin1_to_utf16be(const char*, size_t, char16_t*) const noexcept final override { - return 0; - } +template +simd16 operator|(const simd16 a, const simd16 b) { + return vec_or(a.value, b.value); +} - simdutf_warn_unused size_t convert_latin1_to_utf32(const char*, size_t, char32_t*) const noexcept final override { - return 0; - } +template simd16 operator|(const simd16 a, U b) { + return vec_or(a.value, vec_splats(T(b))); +} - simdutf_warn_unused size_t convert_utf8_to_latin1(const char*, size_t, char*) const 
noexcept final override { - return 0; - } +template +simd16 operator^(const simd16 a, const simd16 b) { + return vec_xor(a.value, b.value); +} - simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char*, size_t, char*) const noexcept final override { - return result(error_code::OTHER, 0); - } +template simd16 operator^(const simd16 a, U b) { + return vec_xor(a.value, vec_splats(T(b))); +} - simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char*, size_t, char*) const noexcept final override { - return 0; - } +simdutf_really_inline simd16::operator simd16() const { + return (vec_u16_t)(value); +} - simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override { - return 0; - } +template struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 4, + "AltiVec kernel should use four registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; - simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override { - return 0; - } + simd16x32(const simd16x32 &o) = delete; // no copy allowed + simd16x32 & + operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override { - return result(error_code::OTHER, 0); - } + simdutf_really_inline + simd16x32(const simd16 chunk0, const simd16 chunk1, + const simd16 chunk2, const simd16 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T *ptr) + : chunks{simd16::load(ptr), + simd16::load(ptr + sizeof(simd16) / sizeof(T)), + simd16::load(ptr + 2 * sizeof(simd16) / sizeof(T)), + simd16::load(ptr + 3 * sizeof(simd16) / sizeof(T))} {} - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd16) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd16) * 1 / sizeof(T)); + this->chunks[2].store(ptr + sizeof(simd16) * 2 / sizeof(T)); + this->chunks[3].store(ptr + sizeof(simd16) * 3 / sizeof(T)); } - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override { - return 0; + simdutf_really_inline simd16 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); } - simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override { - return 0; + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); } - simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override { - return 0; + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16) * 0); + this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16) * 1); + this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16) * 2); + this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16) * 3); } - simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline uint64_t to_bitmask() const { + 
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); } - simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override { - return 0; + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); + this->chunks[2] = this->chunks[2].swap_bytes(); + this->chunks[3] = this->chunks[3].swap_bytes(); } - simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override { - return 0; + simdutf_really_inline uint64_t gt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] > mask, this->chunks[1] > mask, + this->chunks[2] > mask, this->chunks[3] > mask) + .to_bitmask(); } - simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override { - return 0; + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] <= mask, this->chunks[1] <= mask, + this->chunks[2] <= mask, this->chunks[3] <= mask) + .to_bitmask(); } - simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] == mask, this->chunks[1] == mask, + this->chunks[2] == mask, this->chunks[3] == mask) + .to_bitmask(); } - simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(static_cast(low - 1)); + const simd16 mask_high = simd16::splat(static_cast(high + 1)); + return simd16x32( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low), + (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low), + (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)) + .to_bitmask(); } +}; // struct simd16x32 +/* end file src/simdutf/ppc64/simd16-inl.h */ +/* begin file src/simdutf/ppc64/simd32-inl.h */ +// file included directly - simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override { - return 0; - } +template struct simd32; - simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override { - return 0; - } +template struct base32 { + using vector_type = vector_u32_type_for_element; + static const int SIZE = sizeof(vector_type); + static const int ELEMENTS = sizeof(vector_type) / sizeof(T); - simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override { - return 0; - } + vector_type value; - simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override { - return 0; - } + // Zero constructor + simdutf_really_inline base32() : value{vector_type()} {} - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const 
noexcept final override { - return result(error_code::OTHER, 0); - } + // Conversion from SIMD register + simdutf_really_inline base32(const vector_type _value) : value{_value} {} - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override { - return result(error_code::OTHER, 0); - } + // Splat for scalar + simdutf_really_inline base32(T scalar) : value{vec_splats(scalar)} {} - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override { - return 0; - } + template + simdutf_really_inline base32(const Pointer *ptr) + : base32(vec_xl(0, reinterpret_cast(ptr))) {} - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override { - return 0; + // Store to array + template simdutf_really_inline void store(U *dst) const { +#if defined(__clang__) + return vec_xst(this->value, 0, reinterpret_cast(dst)); +#else + return vec_xst(this->value, 0, reinterpret_cast(dst)); +#endif // defined(__clang__) + } + void dump(const char *name = nullptr) const { +#ifdef SIMDUTF_LOGGING + if (name != nullptr) { + printf("%-10s = ", name); + } + + uint32_t tmp[4]; + vec_xst(value, 0, reinterpret_cast(tmp)); + for (int i = 0; i < 4; i++) { + if (i == 0) { + printf("[%08x", tmp[i]); + } else if (i == 4 - 1) { + printf(" %08x]", tmp[i]); + } else { + printf(" %08x", tmp[i]); + } + } + putchar('\n'); +#endif // SIMDUTF_LOGGING } +}; - simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t *, size_t, char* ) const noexcept final override { - return 0; - } +template struct base32_numeric : base32 { + using super = base32; + using vector_type = typename super::vector_type; - simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t *, size_t, char* ) const noexcept final override { - return result(error_code::OTHER, 0); + static simdutf_really_inline simd32 splat(T _value) { + return vec_splats(_value); } - simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t *, size_t, char* ) const noexcept final override { - return 0; - } + static simdutf_really_inline simd32 zero() { return splat(0); } - simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override { - return 0; + template + static simdutf_really_inline simd32 load(const U *values) { + return vec_xl(0, reinterpret_cast(values)); } - simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override { - return result(error_code::OTHER, 0); - } + simdutf_really_inline base32_numeric() : base32() {} - simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override { - return 0; - } + simdutf_really_inline base32_numeric(const vector_type _value) + : base32(_value) {} - simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override { - return 0; + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd32 operator+(const simd32 other) const { + return vec_add(this->value, other.value); } - simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override { - return 0; + simdutf_really_inline simd32 operator-(const simd32 other) const { + return vec_sub(this->value, other.value); } - simdutf_warn_unused result 
convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline simd32 &operator+=(const simd32 other) { + *this = *this + other; + return *static_cast *>(this); } - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline simd32 &operator-=(const simd32 other) { + *this = *this - other; + return *static_cast *>(this); } +}; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override { - return 0; - } +// Forward declaration +template struct simd32; - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override { - return 0; - } +template +simd32 operator==(const simd32 a, const simd32 b); - simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override { - return 0; - } +template +simd32 operator!=(const simd32 a, const simd32 b); - simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override { - return 0; - } +template +simd32 operator>(const simd32 a, const simd32 b); - simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override { - return result(error_code::OTHER, 0); - } +template simd32 operator==(const simd32 a, T b); - simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override { - return result(error_code::OTHER, 0); - } +template simd32 operator!=(const simd32 a, T b); - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override { - return 0; - } +template simd32 operator&(const simd32 a, const simd32 b); - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override { - return 0; - } +template simd32 operator|(const simd32 a, const simd32 b); - void change_endianness_utf16(const char16_t *, size_t, char16_t *) const noexcept final override { +template simd32 operator^(const simd32 a, const simd32 b); +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd32 : base32 { + static simdutf_really_inline simd32 splat(bool _value) { + return (vector_type)vec_splats(uint32_t(-(!!_value))); } - simdutf_warn_unused size_t count_utf16le(const char16_t *, size_t) const noexcept final override { - return 0; - } + simdutf_really_inline simd32(const vector_type _value) + : base32(_value) {} - simdutf_warn_unused size_t count_utf16be(const char16_t *, size_t) const noexcept final override { - return 0; - } + // Splat constructor + simdutf_really_inline simd32(bool _value) : base32(splat(_value)) {} - simdutf_warn_unused size_t count_utf8(const char *, size_t) const noexcept final override { - return 0; + simdutf_really_inline uint16_t to_bitmask() const { + return move_mask_u8(value); } - simdutf_warn_unused size_t latin1_length_from_utf8(const char *, size_t) const noexcept override { - return 0; - } + simdutf_really_inline bool any() const { + const vec_u64_t tmp = (vec_u64_t)value; - simdutf_warn_unused size_t latin1_length_from_utf16(size_t) const noexcept override { - return 0; + return tmp[0] || tmp[1]; // Note: logical or, 
not binary one } - simdutf_warn_unused size_t latin1_length_from_utf32(size_t) const noexcept override { - return 0; - } - simdutf_warn_unused size_t utf8_length_from_latin1(const char *, size_t) const noexcept override { - return 0; - } + simdutf_really_inline bool is_zero() const { + const vec_u64_t tmp = (vec_u64_t)value; - simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override { - return 0; + return (tmp[0] | tmp[1]) == 0; } - simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override { - return 0; + simdutf_really_inline simd32 operator~() const { + return (vec_bool32_t)vec_xor(this->value, vec_splats(uint32_t(0xffffffff))); } +}; - simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override { - return 0; - } +// Unsigned code units +template <> struct simd32 : base32_numeric { + simdutf_really_inline simd32() : base32_numeric() {} - simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override { - return 0; - } + simdutf_really_inline simd32(const vector_type _value) + : base32_numeric(_value) {} - simdutf_warn_unused size_t utf32_length_from_latin1(size_t) const noexcept override { - return 0; - } + // Splat constructor + simdutf_really_inline simd32(uint32_t _value) : simd32(splat(_value)) {} - simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override { - return 0; - } - simdutf_warn_unused size_t utf16_length_from_latin1(size_t) const noexcept override { - return 0; - } - simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *, size_t) const noexcept override { - return 0; - } + // Array constructor + simdutf_really_inline simd32(const char32_t *values) + : simd32(load(reinterpret_cast(values))) {} - simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *, size_t) const noexcept override { - return 0; + // Bit-specific operations + template simdutf_really_inline simd32 shr() const { + return vec_sr(value, vec_splats(uint32_t(N))); } - simdutf_warn_unused size_t utf32_length_from_utf8(const char *, size_t) const noexcept override { - return 0; + template simdutf_really_inline simd32 shl() const { + return vec_sl(value, vec_splats(uint32_t(N))); } - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char *, size_t) const noexcept override { - return 0; + // Change the endianness + simdutf_really_inline simd32 swap_bytes() const { + return vec_revb(value); } - simdutf_warn_unused result base64_to_binary(const char *, size_t, char*, base64_options) const noexcept override { - return result(error_code::OTHER, 0); + simdutf_really_inline uint64_t sum() const { + return uint64_t(value[0]) + uint64_t(value[1]) + uint64_t(value[2]) + + uint64_t(value[3]); } - simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t *, size_t) const noexcept override { - return 0; + static simdutf_really_inline simd16 + pack(const simd32 &v0, const simd32 &v1) { + return vec_packs(v0.value, v1.value); } +}; - simdutf_warn_unused result base64_to_binary(const char16_t *, size_t, char*, base64_options) const noexcept override { - return result(error_code::OTHER, 0); - } - - - simdutf_warn_unused size_t base64_length_from_binary(size_t, base64_options) const noexcept override { - return 0; - } - - size_t binary_to_base64(const char *, size_t, char*, base64_options) const noexcept override { - return 0; - } - - unsupported_implementation() : 
implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} -}; - -const unsupported_implementation* get_unsupported_singleton() { - static const unsupported_implementation unsupported_singleton{}; - return &unsupported_singleton; +template +simd32 operator==(const simd32 a, const simd32 b) { + return vec_cmpeq(a.value, b.value); } -static_assert(std::is_trivially_destructible::value, "unsupported_singleton should be trivially destructible"); -size_t available_implementation_list::size() const noexcept { - return internal::get_available_implementation_pointers().size(); -} -const implementation * const *available_implementation_list::begin() const noexcept { - return internal::get_available_implementation_pointers().begin(); -} -const implementation * const *available_implementation_list::end() const noexcept { - return internal::get_available_implementation_pointers().end(); -} -const implementation *available_implementation_list::detect_best_supported() const noexcept { - // They are prelisted in priority order, so we just go down the list - uint32_t supported_instruction_sets = internal::detect_supported_architectures(); - for (const implementation *impl : internal::get_available_implementation_pointers()) { - uint32_t required_instruction_sets = impl->required_instruction_sets(); - if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; } - } - return get_unsupported_singleton(); // this should never happen? +template +simd32 operator!=(const simd32 a, const simd32 b) { + return vec_cmpne(a.value, b.value); } -const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept { - SIMDUTF_PUSH_DISABLE_WARNINGS - SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe - char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION"); - SIMDUTF_POP_DISABLE_WARNINGS +template simd32 operator==(const simd32 a, T b) { + return vec_cmpeq(a.value, vec_splats(b)); +} - if (force_implementation_name) { - auto force_implementation = get_available_implementations()[force_implementation_name]; - if (force_implementation) { - return get_active_implementation() = force_implementation; - } else { - // Note: abort() and stderr usage within the library is forbidden. - return get_active_implementation() = get_unsupported_singleton(); - } - } - return get_active_implementation() = get_available_implementations().detect_best_supported(); +template simd32 operator!=(const simd32 a, T b) { + return vec_cmpne(a.value, vec_splats(b)); } -} // namespace internal +template +simd32 operator>(const simd32 a, const simd32 b) { + return vec_cmpgt(a.value, b.value); +} +template +simd32 operator>=(const simd32 a, const simd32 b) { + return vec_cmpge(a.value, b.value); +} +template +simd32 operator&(const simd32 a, const simd32 b) { + return vec_and(a.value, b.value); +} -/** - * The list of available implementations compiled into simdutf. - */ -SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() { - static const internal::available_implementation_list available_implementations{}; - return available_implementations; +template simd32 operator&(const simd32 a, U b) { + return vec_and(a.value, vec_splats(T(b))); } -/** - * The active implementation. 
- */ -SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr& get_active_implementation() { -#if SIMDUTF_SINGLE_IMPLEMENTATION - // skip runtime detection - static internal::atomic_ptr active_implementation{internal::get_single_implementation()}; - return active_implementation; -#else - static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton; - static internal::atomic_ptr active_implementation{&detect_best_supported_implementation_on_first_use_singleton}; - return active_implementation; -#endif +template +simd32 operator|(const simd32 a, const simd32 b) { + return vec_or(a.value, b.value); } +template +simd32 operator^(const simd32 a, const simd32 b) { + return vec_xor(a.value, b.value); +} -#if SIMDUTF_SINGLE_IMPLEMENTATION -const implementation * get_default_implementation() { - return internal::get_single_implementation(); +template simd32 operator^(const simd32 a, U b) { + return vec_xor(a.value, vec_splats(T(b))); } -#else -internal::atomic_ptr& get_default_implementation() { - return get_active_implementation(); + +template simd32 max_val(const simd32 a, const simd32 b) { + return vec_max(a.value, b.value); } -#endif -#define SIMDUTF_GET_CURRENT_IMPLEMENTATION -simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { - return get_default_implementation()->validate_utf8(buf, len); +template +simdutf_really_inline simd32 min(const simd32 b, const simd32 a) { + return vec_min(a.value, b.value); } -simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept { - return get_default_implementation()->validate_utf8_with_errors(buf, len); +/* end file src/simdutf/ppc64/simd32-inl.h */ + +template +simd8 select(const simd8 cond, const simd8 val_true, + const simd8 val_false) { + return vec_sel(val_false.value, val_true.value, cond.value); } -simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept { - return get_default_implementation()->validate_ascii(buf, len); + +template +simd8 select(const T cond, const simd8 val_true, + const simd8 val_false) { + return vec_sel(val_false.value, val_true.value, vec_splats(cond)); } -simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept { - return get_default_implementation()->validate_ascii_with_errors(buf, len); + +template +simd16 select(const simd16 cond, const simd16 val_true, + const simd16 val_false) { + return vec_sel(val_false.value, val_true.value, cond.value); } -simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_utf8_to_utf16be(input, length, utf16_output); - #else - return convert_utf8_to_utf16le(input, length, utf16_output); - #endif + +template +simd16 select(const T cond, const simd16 val_true, + const simd16 val_false) { + return vec_sel(val_false.value, val_true.value, vec_splats(cond)); } -simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) noexcept { - return get_default_implementation()->convert_latin1_to_utf8(buf, len,utf8_output); + +template +simd32 select(const simd32 cond, const simd32 val_true, + const simd32 val_false) { + return vec_sel(val_false.value, val_true.value, cond.value); } -simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) noexcept { - return get_default_implementation()->convert_latin1_to_utf16le(buf, len, utf16_output); + 
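These select() wrappers are thin aliases over vec_sel, which blends its
inputs bit by bit: each result bit is taken from val_true where the condition
mask bit is 1 and from val_false where it is 0. A scalar model of the same
branchless blend (illustrative only, not part of the vendored sources):

#include <cstdint>

// Bitwise equivalent of vec_sel(val_false, val_true, cond): a
// mask-driven select with no branches.
static uint32_t select_bits(uint32_t cond, uint32_t val_true,
                            uint32_t val_false) {
  return (val_true & cond) | (val_false & ~cond);
}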
+template +simd32 select(const T cond, const simd32 val_true, + const simd32 val_false) { + return vec_sel(val_false.value, val_true.value, vec_splats(cond)); } -simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) noexcept{ - return get_default_implementation()->convert_latin1_to_utf16be(buf, len, utf16_output); + +using vector_u8 = simd8; +using vector_u16 = simd16; +using vector_u32 = simd32; +using vector_i8 = simd8; + +simdutf_really_inline vector_u8 as_vector_u8(const vector_u16 v) { + return vector_u8::vector_type(v.value); } -simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t * latin1_output) noexcept { - return get_default_implementation()->convert_latin1_to_utf32(buf, len,latin1_output); + +simdutf_really_inline vector_u8 as_vector_u8(const vector_u32 v) { + return vector_u8::vector_type(v.value); } -simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) noexcept { - return get_default_implementation()->convert_utf8_to_latin1(buf, len,latin1_output); + +simdutf_really_inline vector_u8 as_vector_u8(const vector_i8 v) { + return vector_u8::vector_type(v.value); } -simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) noexcept { - return get_default_implementation()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output); + +simdutf_really_inline vector_u8 as_vector_u8(const simd16 v) { + return vector_u8::vector_type(v.value); } -simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) noexcept { - return get_default_implementation()->convert_valid_utf8_to_latin1(buf, len,latin1_output); + +simdutf_really_inline vector_i8 as_vector_i8(const vector_u8 v) { + return vector_i8::vector_type(v.value); } -simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf16le(input, length, utf16_output); + +simdutf_really_inline vector_u16 as_vector_u16(const vector_u8 v) { + return vector_u16::vector_type(v.value); } -simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf16be(input, length, utf16_output); + +simdutf_really_inline vector_u16 as_vector_u16(const simd16 v) { + return vector_u16::vector_type(v.value); } -simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_utf8_to_utf16be_with_errors(input, length, utf16_output); - #else - return convert_utf8_to_utf16le_with_errors(input, length, utf16_output); - #endif + +simdutf_really_inline vector_u32 as_vector_u32(const vector_u8 v) { + return vector_u32::vector_type(v.value); } -simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output); + +simdutf_really_inline vector_u32 as_vector_u32(const vector_u16 v) { + return vector_u32::vector_type(v.value); } -simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept { - return 
get_default_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output); + +simdutf_really_inline vector_u32 max(vector_u32 a, vector_u32 b) { + return vec_max(a.value, b.value); } -simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf32(input, length, utf32_output); + +simdutf_really_inline vector_u32 max(vector_u32 a, vector_u32 b, vector_u32 c) { + return max(max(a, b), c); } -simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output); + +simdutf_really_inline vector_u32 sum4bytes(vector_u8 bytes, vector_u32 acc) { + return vec_sum4s(bytes.value, acc.value); } -simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return validate_utf16be(buf, len); + +} // namespace simd +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +#endif // SIMDUTF_PPC64_SIMD_INPUT_H +/* end file src/simdutf/ppc64/simd.h */ + +/* begin file src/simdutf/ppc64/end.h */ +/* end file src/simdutf/ppc64/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_PPC64 + +#endif // SIMDUTF_PPC64_H +/* end file src/simdutf/ppc64.h */ +/* begin file src/simdutf/rvv.h */ +#ifndef SIMDUTF_RVV_H +#define SIMDUTF_RVV_H + +#ifdef SIMDUTF_FALLBACK_H + #error "rvv.h must be included before fallback.h" +#endif + + +#define SIMDUTF_CAN_ALWAYS_RUN_RVV SIMDUTF_IS_RVV + +#ifndef SIMDUTF_IMPLEMENTATION_RVV + #define SIMDUTF_IMPLEMENTATION_RVV \ + (SIMDUTF_CAN_ALWAYS_RUN_RVV || \ + (SIMDUTF_IS_RISCV64 && SIMDUTF_HAS_RVV_INTRINSICS && \ + SIMDUTF_HAS_RVV_TARGET_REGION)) +#endif + +#if SIMDUTF_IMPLEMENTATION_RVV + + #if SIMDUTF_CAN_ALWAYS_RUN_RVV + #define SIMDUTF_TARGET_RVV #else - return validate_utf16le(buf, len); + #define SIMDUTF_TARGET_RVV SIMDUTF_TARGET_REGION("arch=+v") #endif -} -simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) noexcept { - return get_default_implementation()->validate_utf16le(buf, len); -} -simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) noexcept { - return get_default_implementation()->validate_utf16be(buf, len); -} -simdutf_warn_unused result validate_utf16_with_errors(const char16_t * buf, size_t len) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return validate_utf16be_with_errors(buf, len); - #else - return validate_utf16le_with_errors(buf, len); + #if !SIMDUTF_IS_ZVBB && SIMDUTF_HAS_ZVBB_INTRINSICS + #define SIMDUTF_TARGET_ZVBB SIMDUTF_TARGET_REGION("arch=+v,+zvbb") #endif + +namespace simdutf { +namespace rvv {} // namespace rvv +} // namespace simdutf + +/* begin file src/simdutf/rvv/implementation.h */ +#ifndef SIMDUTF_RVV_IMPLEMENTATION_H +#define SIMDUTF_RVV_IMPLEMENTATION_H + + +namespace simdutf { +namespace rvv { + +namespace { +using namespace simdutf; +} // namespace + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() + : simdutf::implementation("rvv", "RISC-V Vector Extension", + internal::instruction_set::RVV), + _supports_zvbb(internal::detect_supported_architectures() & + internal::instruction_set::ZVBB) {} +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; +#endif // SIMDUTF_FEATURE_DETECT_ENCODING +#if 
SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool validate_ascii(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept final; + + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept final; + void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t 
convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + 
simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *buf, size_t length, + char16_t *output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t *buf, + size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t *buf, + size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_utf16be(const 
char16_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + utf16_length_from_utf8(const char *input, size_t length) const noexcept; + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept; + ; + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept; + ; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf32_length_from_utf8(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + latin1_length_from_utf8(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + utf8_length_from_latin1(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused result + base64_to_binary(const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept; + + size_t binary_to_base64_with_lines(const char *input, size_t length, + char *output, size_t line_length, + base64_options options) const noexcept; + const char *find(const char *start, const char *end, + char character) const noexcept; + const char16_t *find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept; +#endif // SIMDUTF_FEATURE_BASE64 +private: + const bool _supports_zvbb; + +#if SIMDUTF_IS_ZVBB + bool supports_zvbb() const { 
return true; } +#elif SIMDUTF_HAS_ZVBB_INTRINSICS + bool supports_zvbb() const { return _supports_zvbb; } +#else + bool supports_zvbb() const { return false; } +#endif +}; + +} // namespace rvv +} // namespace simdutf + +#endif // SIMDUTF_RVV_IMPLEMENTATION_H +/* end file src/simdutf/rvv/implementation.h */ +/* begin file src/simdutf/rvv/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "rvv" +// #define SIMDUTF_IMPLEMENTATION rvv + +#if SIMDUTF_CAN_ALWAYS_RUN_RVV +// nothing needed. +#else +SIMDUTF_TARGET_RVV +#endif +/* end file src/simdutf/rvv/begin.h */ +/* begin file src/simdutf/rvv/intrinsics.h */ +#ifndef SIMDUTF_RVV_INTRINSICS_H +#define SIMDUTF_RVV_INTRINSICS_H + + +#include <riscv_vector.h> + +#if __riscv_v_intrinsic >= 1000000 || __GCC__ >= 14 + #define simdutf_vrgather_u8m1x2(tbl, idx) \ + __riscv_vcreate_v_u8m1_u8m2( \ + __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), \ + __riscv_vsetvlmax_e8m1()), \ + __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1), \ + __riscv_vsetvlmax_e8m1())); + + #define simdutf_vrgather_u8m1x4(tbl, idx) \ + __riscv_vcreate_v_u8m1_u8m4( \ + __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 0), \ + __riscv_vsetvlmax_e8m1()), \ + __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 1), \ + __riscv_vsetvlmax_e8m1()), \ + __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2), \ + __riscv_vsetvlmax_e8m1()), \ + __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3), \ + __riscv_vsetvlmax_e8m1())); +#else + // This has worse codegen on gcc + #define simdutf_vrgather_u8m1x2(tbl, idx) \ + __riscv_vset_v_u8m1_u8m2( \ + __riscv_vlmul_ext_v_u8m1_u8m2(__riscv_vrgather_vv_u8m1( \ + tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), __riscv_vsetvlmax_e8m1())), \ + 1, \ + __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1), \ + __riscv_vsetvlmax_e8m1())) + + #define simdutf_vrgather_u8m1x4(tbl, idx) \ + __riscv_vset_v_u8m1_u8m4( \ + __riscv_vset_v_u8m1_u8m4( \ + __riscv_vset_v_u8m1_u8m4( \ + __riscv_vlmul_ext_v_u8m1_u8m4(__riscv_vrgather_vv_u8m1( \ + tbl, __riscv_vget_v_u8m4_u8m1(idx, 0), \ + __riscv_vsetvlmax_e8m1())), \ + 1, \ + __riscv_vrgather_vv_u8m1(tbl, \ + __riscv_vget_v_u8m4_u8m1(idx, 1), \ + __riscv_vsetvlmax_e8m1())), \ + 2, \ + __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2), \ + __riscv_vsetvlmax_e8m1())), \ + 3, \ + __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3), \ + __riscv_vsetvlmax_e8m1())) +#endif + +/* Zvbb adds dedicated support for endianness swaps with vrev8, but if we can't + * use that, we have to emulate it with the standard V extension. + * Using LMUL=1 vrgathers could be faster than the srl+macc variant, but that + * would increase register pressure, and vrgather implementations performance + * varies a lot.
*/ +enum class simdutf_ByteFlip { NONE, V, ZVBB }; + +template <simdutf_ByteFlip method> +simdutf_really_inline static uint16_t simdutf_byteflip(uint16_t v) { + if (method != simdutf_ByteFlip::NONE) + return (uint16_t)((v * 1u) << 8 | (v * 1u) >> 8); + return v; } -simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) noexcept { - return get_default_implementation()->validate_utf16le_with_errors(buf, len); -} -simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) noexcept { - return get_default_implementation()->validate_utf16be_with_errors(buf, len); + +#ifdef SIMDUTF_TARGET_ZVBB +SIMDUTF_UNTARGET_REGION +SIMDUTF_TARGET_ZVBB +#endif + +template <simdutf_ByteFlip method> +simdutf_really_inline static vuint16m1_t simdutf_byteflip(vuint16m1_t v, + size_t vl) { +#if SIMDUTF_HAS_ZVBB_INTRINSICS + if (method == simdutf_ByteFlip::ZVBB) + return __riscv_vrev8_v_u16m1(v, vl); +#endif + if (method == simdutf_ByteFlip::V) + return __riscv_vmacc_vx_u16m1(__riscv_vsrl_vx_u16m1(v, 8, vl), 0x100, v, + vl); + return v; } -simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) noexcept { - return get_default_implementation()->validate_utf32(buf, len); + +template <simdutf_ByteFlip method> +simdutf_really_inline static vuint16m2_t simdutf_byteflip(vuint16m2_t v, + size_t vl) { +#if SIMDUTF_HAS_ZVBB_INTRINSICS + if (method == simdutf_ByteFlip::ZVBB) + return __riscv_vrev8_v_u16m2(v, vl); +#endif + if (method == simdutf_ByteFlip::V) + return __riscv_vmacc_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 8, vl), 0x100, v, + vl); + return v; } -simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) noexcept { - return get_default_implementation()->validate_utf32_with_errors(buf, len); + +template <simdutf_ByteFlip method> +simdutf_really_inline static vuint16m4_t simdutf_byteflip(vuint16m4_t v, + size_t vl) { +#if SIMDUTF_HAS_ZVBB_INTRINSICS + if (method == simdutf_ByteFlip::ZVBB) + return __riscv_vrev8_v_u16m4(v, vl); +#endif + if (method == simdutf_ByteFlip::V) + return __riscv_vmacc_vx_u16m4(__riscv_vsrl_vx_u16m4(v, 8, vl), 0x100, v, + vl); + return v; } -simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_valid_utf8_to_utf16be(input, length, utf16_buffer); - #else - return convert_valid_utf8_to_utf16le(input, length, utf16_buffer); - #endif -} -simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept { - return get_default_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer); -} -simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept { - return get_default_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer); -} -simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept { - return get_default_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer); -} -simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_utf8(buf, len, utf8_buffer); - #else - return convert_utf16le_to_utf8(buf, len, utf8_buffer); - #endif -} -simdutf_warn_unused size_t convert_utf16_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_latin1(buf, len,
latin1_buffer); - #else - return convert_utf16le_to_latin1(buf, len, latin1_buffer); - #endif -} -simdutf_warn_unused size_t convert_latin1_to_utf16(const char * buf, size_t len, char16_t* utf16_output) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_latin1_to_utf16be(buf, len, utf16_output); - #else - return convert_latin1_to_utf16le(buf, len, utf16_output); - #endif -} -simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_latin1(buf, len, latin1_buffer); -} -simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_latin1(buf, len, latin1_buffer); -} -simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16be_to_latin1(buf, len, latin1_buffer); -} -simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16le_to_latin1(buf, len, latin1_buffer); -} -simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer); -} -simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer); -} -simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer); -} -simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer); -} -simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); - #else - return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); - #endif -} -simdutf_warn_unused result convert_utf16_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer); - #else - return convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer); - #endif -} -simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); -} -simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); -} -simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer); - #else - return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); - #endif -} 
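
The simdutf_byteflip overloads introduced above emulate a 16-bit byteswap on plain RVV by pairing a shift with a multiply-accumulate: per lane, vmacc(vsrl(v, 8), 0x100, v) computes (v >> 8) + 0x100 * v, and because the accumulate wraps modulo 2^16 the two bytes simply trade places with no carry between them. A scalar sketch of that identity (not part of the vendored file; byteflip_v_model is a hypothetical name used for illustration):

#include <cassert>
#include <cstdint>

// Scalar model of the vsrl+vmacc byteswap used when only the plain V
// extension is available: (v >> 8) + 0x100 * v, truncated to 16 bits.
static uint16_t byteflip_v_model(uint16_t v) {
  return static_cast<uint16_t>((v >> 8) + 0x100u * v);
}

int main() {
  // The emulation must agree with a straightforward byteswap for all
  // 65536 possible 16-bit lane values.
  for (uint32_t v = 0; v <= 0xFFFF; ++v) {
    const uint16_t swapped = static_cast<uint16_t>((v << 8) | (v >> 8));
    assert(byteflip_v_model(static_cast<uint16_t>(v)) == swapped);
  }
  return 0;
}
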
-simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_valid_utf16be_to_latin1(buf, len, latin1_buffer); - #else - return convert_valid_utf16le_to_latin1(buf, len, latin1_buffer); - #endif -} -simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); -} -simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer); -} -simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer); -} -simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer); -} -simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept { - return get_default_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer); -} -simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_utf32_to_utf16be(buf, len, utf16_buffer); - #else - return convert_utf32_to_utf16le(buf, len, utf16_buffer); - #endif -} -simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_output) noexcept { - return get_default_implementation()->convert_utf32_to_latin1(input, length, latin1_output); -} -simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer); -} -simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer); -} -simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer); - #else - return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer); - #endif -} -simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer); -} -simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer); -} -simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer); - #else - return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer); - #endif -} -simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - 
return get_default_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer); -} -simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return get_default_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer); -} -simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_utf32(buf, len, utf32_buffer); - #else - return convert_utf16le_to_utf32(buf, len, utf32_buffer); - #endif -} -simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer); -} -simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer); -} -simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer); - #else - return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer); - #endif -} -simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer); -} -simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer); -} -simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer); - #else - return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer); - #endif -} -simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer); -} -simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer); -} -void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept { - get_default_implementation()->change_endianness_utf16(input, length, output); -} -simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return count_utf16be(input, length); - #else - return count_utf16le(input, length); - #endif -} -simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept { - return get_default_implementation()->count_utf16le(input, length); -} -simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept { - return get_default_implementation()->count_utf16be(input, length); -} -simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept { - return get_default_implementation()->count_utf8(input, length); -} -simdutf_warn_unused size_t latin1_length_from_utf8(const char * buf, size_t len) 
noexcept { - return get_default_implementation()->latin1_length_from_utf8(buf, len); -} -simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) noexcept { - return get_default_implementation()->latin1_length_from_utf16(len); -} -simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) noexcept { - return get_default_implementation()->latin1_length_from_utf32(len); -} -simdutf_warn_unused size_t utf8_length_from_latin1(const char * buf, size_t len) noexcept { - return get_default_implementation()->utf8_length_from_latin1(buf, len); -} -simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return utf8_length_from_utf16be(input, length); - #else - return utf8_length_from_utf16le(input, length); - #endif -} -simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept { - return get_default_implementation()->utf8_length_from_utf16le(input, length); -} -simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept { - return get_default_implementation()->utf8_length_from_utf16be(input, length); -} -simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept { - #if SIMDUTF_IS_BIG_ENDIAN - return utf32_length_from_utf16be(input, length); - #else - return utf32_length_from_utf16le(input, length); - #endif -} -simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept { - return get_default_implementation()->utf32_length_from_utf16le(input, length); -} -simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept { - return get_default_implementation()->utf32_length_from_utf16be(input, length); -} -simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept { - return get_default_implementation()->utf16_length_from_utf8(input, length); -} -simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept { - return get_default_implementation()->utf16_length_from_latin1(length); -} -simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept { - return get_default_implementation()->utf8_length_from_utf32(input, length); -} -simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept { - return get_default_implementation()->utf16_length_from_utf32(input, length); -} -simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept { - return get_default_implementation()->utf32_length_from_utf8(input, length); + +template <simdutf_ByteFlip method> +simdutf_really_inline static vuint16m8_t simdutf_byteflip(vuint16m8_t v, + size_t vl) { +#if SIMDUTF_HAS_ZVBB_INTRINSICS + if (method == simdutf_ByteFlip::ZVBB) + return __riscv_vrev8_v_u16m8(v, vl); +#endif + if (method == simdutf_ByteFlip::V) + return __riscv_vmacc_vx_u16m8(__riscv_vsrl_vx_u16m8(v, 8, vl), 0x100, v, + vl); + return v; } -simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept { - return get_default_implementation()->maximal_binary_length_from_base64(input, length); -} +#ifdef SIMDUTF_TARGET_ZVBB +SIMDUTF_UNTARGET_REGION +SIMDUTF_TARGET_RVV +#endif -simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) noexcept { - return get_default_implementation()->base64_to_binary(input, length, output, options); -} +#endif //
SIMDUTF_RVV_INTRINSICS_H +/* end file src/simdutf/rvv/intrinsics.h */ +/* begin file src/simdutf/rvv/end.h */ +#if SIMDUTF_CAN_ALWAYS_RUN_RVV +// nothing needed. +#else +SIMDUTF_UNTARGET_REGION +#endif -simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept { - return get_default_implementation()->maximal_binary_length_from_base64(input, length); -} +/* end file src/simdutf/rvv/end.h */ -simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) noexcept { - return get_default_implementation()->base64_to_binary(input, length, output, options); -} +#endif // SIMDUTF_IMPLEMENTATION_RVV -template <typename chartype> -simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept { - static_assert(std::is_same<chartype, char>::value || std::is_same<chartype, char16_t>::value, "Only char and char16_t are supported."); - // The implementation could be nicer, but we expect that most times, the user - // will provide us with a buffer that is large enough. - size_t max_length = maximal_binary_length_from_base64(input, length); - if(outlen >= max_length) { - // fast path - result r = base64_to_binary(input, length, output, options); - if(r.error != error_code::INVALID_BASE64_CHARACTER) { outlen = r.count; r.count = length; } - return r; - } - // The output buffer is maybe too small. We will decode a truncated version of the input. - size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3 - size_t safe_input = base64_length_from_binary(outlen3, options); - result r = base64_to_binary(input, safe_input, output, options); - if(r.error == error_code::INVALID_BASE64_CHARACTER) { return r; } - size_t offset = (r.error == error_code::BASE64_INPUT_REMAINDER) ? 1 : - ((r.count % 3) == 0 ? 0 : (r.count % 3) + 1); - size_t output_index = r.count - (r.count % 3); - size_t input_index = safe_input; - // offset is a value that is no larger than 3. We backtrack - // by up to offset characters + an undetermined number of - // white space characters. It is expected that the next loop - // runs at most 3 times + the number of white space characters - // in between them, so we are not worried about performance.
- while(offset > 0 && input_index > 0) { - chartype c = input[--input_index]; - if(scalar::base64::is_ascii_white_space(c)){ - // skipping - } else { - offset--; - } - } - size_t remaining_out = outlen - output_index; - const chartype * tail_input = input + input_index; - size_t tail_length = length - input_index; - while(tail_length > 0 && scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) { - tail_length--; - } - size_t padding_characts = 0; - if(tail_length > 0 && tail_input[tail_length - 1] == '=') { - tail_length--; - padding_characts++; - while(tail_length > 0 && scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) { - tail_length--; - } - if(tail_length > 0 && tail_input[tail_length - 1] == '=') { - tail_length--; - padding_characts++; - } - } - r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, tail_input, tail_length, options); - outlen = output_index + remaining_out; - if(r.error == error_code::SUCCESS && padding_characts > 0) { - // additional checks - if((outlen % 3 == 0) || ((outlen % 3) + 1 + padding_characts != 4)) { - r.error = error_code::INVALID_BASE64_CHARACTER; - } - } - r.count += input_index; - return r; -} +#endif // SIMDUTF_RVV_H +/* end file src/simdutf/rvv.h */ +/* begin file src/simdutf/lasx.h */ +#ifndef SIMDUTF_LASX_H +#define SIMDUTF_LASX_H + +#ifdef SIMDUTF_FALLBACK_H + #error "lasx.h must be included before fallback.h" +#endif -simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept { - return base64_to_binary_safe_impl(input, length, output, outlen, options); -} -simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept { - return base64_to_binary_safe_impl(input, length, output, outlen, options); -} +#ifndef SIMDUTF_IMPLEMENTATION_LASX + #define SIMDUTF_IMPLEMENTATION_LASX (SIMDUTF_IS_LSX) +#endif +#if SIMDUTF_IMPLEMENTATION_LASX && SIMDUTF_IS_LASX + #define SIMDUTF_CAN_ALWAYS_RUN_LASX 1 +#else + #define SIMDUTF_CAN_ALWAYS_RUN_LASX 0 +#endif -simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options) noexcept { - return get_default_implementation()->base64_length_from_binary(length, options); -} +#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK) -size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) noexcept { - return get_default_implementation()->binary_to_base64(input, length, output, options); -} +#if SIMDUTF_IMPLEMENTATION_LASX + #define SIMDUTF_TARGET_LASX SIMDUTF_TARGET_REGION("lasx,lsx") -simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept { - return get_default_implementation()->autodetect_encoding(buf, length); -} -simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept { - return get_default_implementation()->detect_encodings(buf, length); -} -const implementation * builtin_implementation() { - static const implementation * builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)]; - return builtin_impl; -} + // For runtime dispatching to work, we need the lsxintrin to appear + // before we call SIMDUTF_TARGET_LASX. It is unclear why. 
+ #include -simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) { - return scalar::utf8::trim_partial_utf8(input, length); -} +namespace simdutf { +/** + * Implementation for LoongArch ASX. + */ +namespace lasx {} // namespace lasx +} // namespace simdutf -simdutf_warn_unused size_t trim_partial_utf16be(const char16_t* input, size_t length) { - return scalar::utf16::trim_partial_utf16(input, length); -} +/* begin file src/simdutf/lasx/implementation.h */ +#ifndef SIMDUTF_LASX_IMPLEMENTATION_H +#define SIMDUTF_LASX_IMPLEMENTATION_H -simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t length) { - return scalar::utf16::trim_partial_utf16(input, length); -} -simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length) { - #if SIMDUTF_IS_BIG_ENDIAN - return trim_partial_utf16be(input, length); - #else - return trim_partial_utf16le(input, length); - #endif +namespace simdutf { +namespace lasx { + +namespace { +using namespace simdutf; } +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() + : simdutf::implementation("lasx", "LOONGARCH ASX", + internal::instruction_set::LSX | + internal::instruction_set::LASX) {} +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; +#endif // SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool validate_ascii(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept final; + + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; + void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept final; + void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused 
result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + 
simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( 
+ const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *buf, size_t length, + char16_t *output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t *buf, + size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t *buf, + size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + utf16_length_from_utf8(const char *input, size_t length) const noexcept; + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept; + ; + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept; + ; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf32_length_from_utf8(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + latin1_length_from_utf8(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + utf8_length_from_latin1(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const 
noexcept; + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused result + base64_to_binary(const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept; + size_t binary_to_base64_with_lines(const char *input, size_t length, + char *output, size_t line_length, + base64_options options) const noexcept; + const char *find(const char *start, const char *end, + char character) const noexcept; + const char16_t *find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept; +#endif // SIMDUTF_FEATURE_BASE64 +}; + +} // namespace lasx } // namespace simdutf -/* end file src/implementation.cpp */ -/* begin file src/encoding_types.cpp */ +#endif // SIMDUTF_LASX_IMPLEMENTATION_H +/* end file src/simdutf/lasx/implementation.h */ -namespace simdutf { -bool match_system(endianness e) { -#if SIMDUTF_IS_BIG_ENDIAN - return e == endianness::BIG; +/* begin file src/simdutf/lasx/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "lasx" +// #define SIMDUTF_IMPLEMENTATION lasx +#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 + +#if SIMDUTF_CAN_ALWAYS_RUN_LASX +// nothing needed. #else - return e == endianness::LITTLE; +SIMDUTF_TARGET_LASX #endif -} +/* end file src/simdutf/lasx/begin.h */ -std::string to_string(encoding_type bom) { - switch (bom) { - case UTF16_LE: return "UTF16 little-endian"; - case UTF16_BE: return "UTF16 big-endian"; - case UTF32_LE: return "UTF32 little-endian"; - case UTF32_BE: return "UTF32 big-endian"; - case UTF8: return "UTF8"; - case unspecified: return "unknown"; - default: return "error"; - } -} + // Declarations +/* begin file src/simdutf/lasx/intrinsics.h */ +#ifndef SIMDUTF_LASX_INTRINSICS_H +#define SIMDUTF_LASX_INTRINSICS_H -namespace BOM { -// Note that BOM for UTF8 is discouraged. 
-encoding_type check_bom(const uint8_t* byte, size_t length) {
-   if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
-     if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
-       return encoding_type::UTF32_LE;
-     } else {
-       return encoding_type::UTF16_LE;
-     }
-   } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
-     return encoding_type::UTF16_BE;
-   } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) {
-     return encoding_type::UTF32_BE;
-   } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[2] == 0xbf) {
-     return encoding_type::UTF8;
-   }
-   return encoding_type::unspecified;
- }
-
-encoding_type check_bom(const char* byte, size_t length) {
-   return check_bom(reinterpret_cast<const uint8_t*>(byte), length);
- }
-
- size_t bom_byte_size(encoding_type bom) {
-   switch (bom) {
-     case UTF16_LE: return 2;
-     case UTF16_BE: return 2;
-     case UTF32_LE: return 4;
-     case UTF32_BE: return 4;
-     case UTF8: return 3;
-     case unspecified: return 0;
-     default: return 0;
-   }
-}
+// This should be the correct header whether
+// you use visual studio or other compilers.
+#include <lsxintrin.h>
+#include <lasxintrin.h>
+
+#if defined(__loongarch_asx)
+  #ifdef __clang__
+    #define VREGS_PREFIX "$vr"
+    #define XREGS_PREFIX "$xr"
+  #else // GCC
+    #define VREGS_PREFIX "$f"
+    #define XREGS_PREFIX "$f"
+  #endif
+  #define __ALL_REGS                                                          \
+    "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26," \
+    "27,28,29,30,31"
+// Convert __m128i to __m256i
+static inline __m256i ____m256i(__m128i in) {
+  __m256i out = __lasx_xvldi(0);
+  __asm__ volatile(".irp i," __ALL_REGS "\n\t"
+                   " .ifc %[out], " XREGS_PREFIX "\\i \n\t"
+                   " .irp j," __ALL_REGS "\n\t"
+                   " .ifc %[in], " VREGS_PREFIX "\\j \n\t"
+                   " xvpermi.q $xr\\i, $xr\\j, 0x0 \n\t"
+                   " .endif \n\t"
+                   " .endr \n\t"
+                   " .endif \n\t"
+                   ".endr \n\t"
+                   : [out] "+f"(out)
+                   : [in] "f"(in));
+  return out;
}
+// Convert two __m128i to __m256i
+static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
+  __m256i out;
+  __asm__ volatile(".irp i," __ALL_REGS "\n\t"
+                   " .ifc %[hi], " VREGS_PREFIX "\\i \n\t"
+                   " .irp j," __ALL_REGS "\n\t"
+                   " .ifc %[lo], " VREGS_PREFIX "\\j \n\t"
+                   " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
+                   " .endif \n\t"
+                   " .endr \n\t"
+                   " .endif \n\t"
+                   ".endr \n\t"
+                   ".ifnc %[out], %[hi] \n\t"
+                   ".irp i," __ALL_REGS "\n\t"
+                   " .ifc %[out], " XREGS_PREFIX "\\i \n\t"
+                   " .irp j," __ALL_REGS "\n\t"
+                   " .ifc %[hi], " VREGS_PREFIX "\\j \n\t"
+                   " xvori.b $xr\\i, $xr\\j, 0 \n\t"
+                   " .endif \n\t"
+                   " .endr \n\t"
+                   " .endif \n\t"
+                   ".endr \n\t"
+                   ".endif \n\t"
+                   : [out] "=f"(out), [hi] "+f"(inhi)
+                   : [lo] "f"(inlo));
+  return out;
}
-/* end file src/encoding_types.cpp */
-/* begin file src/error.cpp */
-namespace simdutf {
-// deliberately empty
+// Convert __m256i low part to __m128i
+static inline __m128i lasx_extracti128_lo(__m256i in) {
+  __m128i out;
+  __asm__ volatile(".ifnc %[out], %[in] \n\t"
+                   ".irp i," __ALL_REGS "\n\t"
+                   " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
+                   " .irp j," __ALL_REGS "\n\t"
+                   " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
+                   " vori.b $vr\\i, $vr\\j, 0 \n\t"
+                   " .endif \n\t"
+                   " .endr \n\t"
+                   " .endif \n\t"
+                   ".endr \n\t"
+                   ".endif \n\t"
+                   : [out] "=f"(out)
+                   : [in] "f"(in));
+  return out;
}
-/* end file src/error.cpp */
-// The large tables should be included once and they
-// should not depend on a kernel.
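A quirk worth noting in the removed encoding_types.cpp above: the old check_bom recognized the three-byte UTF-8 BOM only when at least four bytes were available (the removed `length >= 4` branch), so a buffer holding exactly the BOM was reported as unspecified. A minimal caller sketch for these entry points, assuming only the public simdutf.h header; the bom_demo function and its literal payload are illustrative and not part of the vendored sources:

#include <cstdio>
#include "simdutf.h"

// Illustrative only; not part of the vendored files.
static void bom_demo(void) {
  const char text[] = "\xEF\xBB\xBFhello"; // UTF-8 BOM followed by payload
  simdutf::encoding_type e = simdutf::BOM::check_bom(text, sizeof(text) - 1);
  std::printf("%s, BOM is %zu bytes\n", simdutf::to_string(e).c_str(),
              simdutf::BOM::bom_byte_size(e)); // prints "UTF8, BOM is 3 bytes"
}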
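The inline-assembly helpers added here (____m256i, lasx_set_q, lasx_extracti128_lo, plus lasx_extracti128_hi just below) work around the lack of a portable intrinsic for moving values between 128-bit LSX and 256-bit LASX registers: the .irp/.ifc assembler loops discover which register the compiler allocated for each operand and then emit the matching xvpermi.q or vori.b. A rough round-trip sketch, assuming a LoongArch64 toolchain with LASX enabled; lasx_helpers_demo, the fill constants, and the use of the __lsx_vreplgr2vr_b intrinsic are illustrative and not part of the vendored sources:

#if defined(__loongarch_asx)
// Illustrative only; not part of the vendored files.
static inline void lasx_helpers_demo(void) {
  __m128i lo = __lsx_vreplgr2vr_b(0x11);   // sixteen bytes of 0x11
  __m128i hi = __lsx_vreplgr2vr_b(0x22);   // sixteen bytes of 0x22
  __m256i wide = lasx_set_q(hi, lo);       // hi -> upper 128 bits, lo -> lower
  __m128i lo2 = lasx_extracti128_lo(wide); // all 0x11 again
  __m128i hi2 = lasx_extracti128_hi(wide); // all 0x22 again (defined just below)
  (void)lo2;
  (void)hi2;
}
#endif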
-/* begin file src/tables/utf8_to_utf16_tables.h */ -#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H -#define SIMDUTF_UTF8_TO_UTF16_TABLES_H -#include +// Convert __m256i high part to __m128i +static inline __m128i lasx_extracti128_hi(__m256i in) { + __m128i out; + __asm__ volatile(".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " XREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + : [out] "=f"(out) + : [in] "f"(in)); + return out; +} +#endif -namespace simdutf { -namespace { -namespace tables { -namespace utf8_to_utf16 { -/** - * utf8bigindex uses about 8 kB - * shufutf8 uses about 3344 B - * - * So we use a bit over 11 kB. It would be - * easy to save about 4 kB by only - * storing the index in utf8bigindex, and - * deriving the consumed bytes otherwise. - * However, this may come at a significant (10% to 20%) - * performance penalty. - */ +/* +Encoding of argument for LoongArch64 xvldi instruction. See: +https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/#__m256i-__lasx_xvldi-imm_n1024_1023-imm -const uint8_t shufutf8[209][16] = -{ {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 
0, 0, 0}, - {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0}, - {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, - {1, 
0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 
1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0}, - {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, - {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0}, - {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0}, - {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, - {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, - {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0}, - {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0}, - {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, - {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, - {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0}, - {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0}, - {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, - {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0}, - {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0}, - {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 
255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}}; -/* number of two bytes : 64 */ -/* number of two + three bytes : 145 */ -/* number of two + three + four bytes : 209 */ -const uint8_t utf8bigindex[4096][2] = -{ {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {147, 5}, - {209, 12}, - {150, 5}, - {162, 5}, - {65, 5}, - {209, 12}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {148, 6}, - {209, 12}, - {151, 6}, - {163, 6}, - {66, 6}, - {209, 12}, - {154, 6}, - {166, 6}, - {68, 6}, - {178, 6}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {169, 6}, - {70, 6}, - {181, 6}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {152, 7}, - {164, 7}, - {145, 3}, - {209, 12}, - {155, 7}, - {167, 7}, - {69, 7}, - {179, 7}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {170, 7}, - {71, 7}, - {182, 7}, - {77, 7}, - {95, 7}, - {65, 5}, - {194, 7}, - {83, 7}, - {101, 7}, - {67, 5}, - {119, 7}, - {73, 5}, - {91, 5}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {185, 7}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {68, 6}, - {121, 7}, - {74, 6}, - {92, 6}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {76, 6}, - {94, 6}, - {4, 7}, - {193, 6}, - {82, 6}, - {100, 6}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {171, 8}, - {72, 8}, - {183, 8}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {186, 8}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {104, 8}, - {68, 6}, - {122, 8}, - {74, 6}, - {92, 6}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {76, 6}, - {94, 6}, - {5, 8}, - {193, 6}, - {82, 6}, - {100, 6}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {77, 7}, - {95, 7}, - {6, 8}, - {194, 7}, - {83, 7}, - {101, 7}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {160, 9}, - {172, 9}, - {147, 5}, - {184, 
9}, - {150, 5}, - {162, 5}, - {65, 5}, - {196, 9}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {175, 9}, - {148, 6}, - {187, 9}, - {81, 9}, - {99, 9}, - {66, 6}, - {199, 9}, - {87, 9}, - {105, 9}, - {68, 6}, - {123, 9}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {111, 9}, - {70, 6}, - {129, 9}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {190, 9}, - {152, 7}, - {164, 7}, - {145, 3}, - {202, 9}, - {89, 9}, - {107, 9}, - {69, 7}, - {125, 9}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {113, 9}, - {71, 7}, - {131, 9}, - {77, 7}, - {95, 7}, - {7, 9}, - {194, 7}, - {83, 7}, - {101, 7}, - {11, 9}, - {119, 7}, - {19, 9}, - {35, 9}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {137, 9}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {13, 9}, - {121, 7}, - {21, 9}, - {37, 9}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {25, 9}, - {41, 9}, - {4, 7}, - {193, 6}, - {82, 6}, - {49, 9}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {205, 9}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {115, 9}, - {72, 8}, - {133, 9}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {139, 9}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {104, 8}, - {14, 9}, - {122, 8}, - {22, 9}, - {38, 9}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {26, 9}, - {42, 9}, - {5, 8}, - {193, 6}, - {82, 6}, - {50, 9}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {28, 9}, - {44, 9}, - {6, 8}, - {194, 7}, - {83, 7}, - {52, 9}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {56, 9}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {147, 5}, - {209, 12}, - {150, 5}, - {162, 5}, - {65, 5}, - {209, 12}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {176, 10}, - {148, 6}, - {188, 10}, - {151, 6}, - {163, 6}, - {66, 6}, - {200, 10}, - {154, 6}, - {166, 6}, - {68, 6}, - {178, 6}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {169, 6}, - {70, 6}, - {181, 6}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {191, 10}, 
- {152, 7}, - {164, 7}, - {145, 3}, - {203, 10}, - {90, 10}, - {108, 10}, - {69, 7}, - {126, 10}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {114, 10}, - {71, 7}, - {132, 10}, - {77, 7}, - {95, 7}, - {65, 5}, - {194, 7}, - {83, 7}, - {101, 7}, - {67, 5}, - {119, 7}, - {73, 5}, - {91, 5}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {138, 10}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {68, 6}, - {121, 7}, - {74, 6}, - {92, 6}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {76, 6}, - {94, 6}, - {4, 7}, - {193, 6}, - {82, 6}, - {100, 6}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {206, 10}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {116, 10}, - {72, 8}, - {134, 10}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {140, 10}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {104, 8}, - {15, 10}, - {122, 8}, - {23, 10}, - {39, 10}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {27, 10}, - {43, 10}, - {5, 8}, - {193, 6}, - {82, 6}, - {51, 10}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {29, 10}, - {45, 10}, - {6, 8}, - {194, 7}, - {83, 7}, - {53, 10}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {57, 10}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {160, 9}, - {172, 9}, - {147, 5}, - {184, 9}, - {150, 5}, - {162, 5}, - {65, 5}, - {196, 9}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {175, 9}, - {148, 6}, - {142, 10}, - {81, 9}, - {99, 9}, - {66, 6}, - {199, 9}, - {87, 9}, - {105, 9}, - {68, 6}, - {123, 9}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {111, 9}, - {70, 6}, - {129, 9}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {190, 9}, - {152, 7}, - {164, 7}, - {145, 3}, - {202, 9}, - {89, 9}, - {107, 9}, - {69, 7}, - {125, 9}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {113, 9}, - {71, 7}, - {131, 9}, - {30, 10}, - {46, 10}, - {7, 9}, - {194, 7}, - {83, 7}, - {54, 10}, - {11, 9}, - {119, 7}, - {19, 9}, - {35, 9}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {137, 9}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {58, 10}, - {13, 9}, - {121, 7}, - {21, 9}, - {37, 9}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - 
{127, 7}, - {25, 9}, - {41, 9}, - {4, 7}, - {193, 6}, - {82, 6}, - {49, 9}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {205, 9}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {115, 9}, - {72, 8}, - {133, 9}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {139, 9}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {60, 10}, - {14, 9}, - {122, 8}, - {22, 9}, - {38, 9}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {26, 9}, - {42, 9}, - {5, 8}, - {193, 6}, - {82, 6}, - {50, 9}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {28, 9}, - {44, 9}, - {6, 8}, - {194, 7}, - {83, 7}, - {52, 9}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {56, 9}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {147, 5}, - {209, 12}, - {150, 5}, - {162, 5}, - {65, 5}, - {209, 12}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {148, 6}, - {209, 12}, - {151, 6}, - {163, 6}, - {66, 6}, - {209, 12}, - {154, 6}, - {166, 6}, - {68, 6}, - {178, 6}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {169, 6}, - {70, 6}, - {181, 6}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {192, 11}, - {152, 7}, - {164, 7}, - {145, 3}, - {204, 11}, - {155, 7}, - {167, 7}, - {69, 7}, - {179, 7}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {170, 7}, - {71, 7}, - {182, 7}, - {77, 7}, - {95, 7}, - {65, 5}, - {194, 7}, - {83, 7}, - {101, 7}, - {67, 5}, - {119, 7}, - {73, 5}, - {91, 5}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {185, 7}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {68, 6}, - {121, 7}, - {74, 6}, - {92, 6}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {76, 6}, - {94, 6}, - {4, 7}, - {193, 6}, - {82, 6}, - {100, 6}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {207, 11}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {117, 11}, - {72, 8}, - {135, 11}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - 
{148, 6}, - {141, 11}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {104, 8}, - {68, 6}, - {122, 8}, - {74, 6}, - {92, 6}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {76, 6}, - {94, 6}, - {5, 8}, - {193, 6}, - {82, 6}, - {100, 6}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {77, 7}, - {95, 7}, - {6, 8}, - {194, 7}, - {83, 7}, - {101, 7}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {160, 9}, - {172, 9}, - {147, 5}, - {184, 9}, - {150, 5}, - {162, 5}, - {65, 5}, - {196, 9}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {175, 9}, - {148, 6}, - {143, 11}, - {81, 9}, - {99, 9}, - {66, 6}, - {199, 9}, - {87, 9}, - {105, 9}, - {68, 6}, - {123, 9}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {111, 9}, - {70, 6}, - {129, 9}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {190, 9}, - {152, 7}, - {164, 7}, - {145, 3}, - {202, 9}, - {89, 9}, - {107, 9}, - {69, 7}, - {125, 9}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {113, 9}, - {71, 7}, - {131, 9}, - {31, 11}, - {47, 11}, - {7, 9}, - {194, 7}, - {83, 7}, - {55, 11}, - {11, 9}, - {119, 7}, - {19, 9}, - {35, 9}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {137, 9}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {59, 11}, - {13, 9}, - {121, 7}, - {21, 9}, - {37, 9}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {25, 9}, - {41, 9}, - {4, 7}, - {193, 6}, - {82, 6}, - {49, 9}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {205, 9}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {115, 9}, - {72, 8}, - {133, 9}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {139, 9}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {61, 11}, - {14, 9}, - {122, 8}, - {22, 9}, - {38, 9}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {26, 9}, - {42, 9}, - {5, 8}, - {193, 6}, - {82, 6}, - {50, 9}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - 
{130, 8}, - {28, 9}, - {44, 9}, - {6, 8}, - {194, 7}, - {83, 7}, - {52, 9}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {56, 9}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {147, 5}, - {209, 12}, - {150, 5}, - {162, 5}, - {65, 5}, - {209, 12}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {176, 10}, - {148, 6}, - {188, 10}, - {151, 6}, - {163, 6}, - {66, 6}, - {200, 10}, - {154, 6}, - {166, 6}, - {68, 6}, - {178, 6}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {169, 6}, - {70, 6}, - {181, 6}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {191, 10}, - {152, 7}, - {164, 7}, - {145, 3}, - {203, 10}, - {90, 10}, - {108, 10}, - {69, 7}, - {126, 10}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {114, 10}, - {71, 7}, - {132, 10}, - {77, 7}, - {95, 7}, - {65, 5}, - {194, 7}, - {83, 7}, - {101, 7}, - {67, 5}, - {119, 7}, - {73, 5}, - {91, 5}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {138, 10}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {68, 6}, - {121, 7}, - {74, 6}, - {92, 6}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {76, 6}, - {94, 6}, - {4, 7}, - {193, 6}, - {82, 6}, - {100, 6}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {206, 10}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {116, 10}, - {72, 8}, - {134, 10}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {140, 10}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {62, 11}, - {15, 10}, - {122, 8}, - {23, 10}, - {39, 10}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {27, 10}, - {43, 10}, - {5, 8}, - {193, 6}, - {82, 6}, - {51, 10}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {29, 10}, - {45, 10}, - {6, 8}, - {194, 7}, - {83, 7}, - {53, 10}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {57, 10}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - 
{209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {160, 9}, - {172, 9}, - {147, 5}, - {184, 9}, - {150, 5}, - {162, 5}, - {65, 5}, - {196, 9}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {175, 9}, - {148, 6}, - {142, 10}, - {81, 9}, - {99, 9}, - {66, 6}, - {199, 9}, - {87, 9}, - {105, 9}, - {68, 6}, - {123, 9}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {111, 9}, - {70, 6}, - {129, 9}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {190, 9}, - {152, 7}, - {164, 7}, - {145, 3}, - {202, 9}, - {89, 9}, - {107, 9}, - {69, 7}, - {125, 9}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {113, 9}, - {71, 7}, - {131, 9}, - {30, 10}, - {46, 10}, - {7, 9}, - {194, 7}, - {83, 7}, - {54, 10}, - {11, 9}, - {119, 7}, - {19, 9}, - {35, 9}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {137, 9}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {58, 10}, - {13, 9}, - {121, 7}, - {21, 9}, - {37, 9}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {25, 9}, - {41, 9}, - {4, 7}, - {193, 6}, - {82, 6}, - {49, 9}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {205, 9}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {115, 9}, - {72, 8}, - {133, 9}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {139, 9}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {60, 10}, - {14, 9}, - {122, 8}, - {22, 9}, - {38, 9}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {26, 9}, - {42, 9}, - {5, 8}, - {193, 6}, - {82, 6}, - {50, 9}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {28, 9}, - {44, 9}, - {6, 8}, - {194, 7}, - {83, 7}, - {52, 9}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {56, 9}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {147, 5}, - {209, 12}, - {150, 5}, - {162, 5}, - {65, 5}, - {209, 12}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {148, 6}, - {209, 12}, - {151, 6}, - {163, 6}, - {66, 6}, - {209, 12}, - {154, 6}, - {166, 6}, - {68, 6}, - {178, 6}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - 
{169, 6}, - {70, 6}, - {181, 6}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {152, 7}, - {164, 7}, - {145, 3}, - {209, 12}, - {155, 7}, - {167, 7}, - {69, 7}, - {179, 7}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {170, 7}, - {71, 7}, - {182, 7}, - {77, 7}, - {95, 7}, - {65, 5}, - {194, 7}, - {83, 7}, - {101, 7}, - {67, 5}, - {119, 7}, - {73, 5}, - {91, 5}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {185, 7}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {68, 6}, - {121, 7}, - {74, 6}, - {92, 6}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {76, 6}, - {94, 6}, - {4, 7}, - {193, 6}, - {82, 6}, - {100, 6}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {208, 12}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {171, 8}, - {72, 8}, - {183, 8}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {186, 8}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {104, 8}, - {68, 6}, - {122, 8}, - {74, 6}, - {92, 6}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {76, 6}, - {94, 6}, - {5, 8}, - {193, 6}, - {82, 6}, - {100, 6}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {77, 7}, - {95, 7}, - {6, 8}, - {194, 7}, - {83, 7}, - {101, 7}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {160, 9}, - {172, 9}, - {147, 5}, - {184, 9}, - {150, 5}, - {162, 5}, - {65, 5}, - {196, 9}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {175, 9}, - {148, 6}, - {144, 12}, - {81, 9}, - {99, 9}, - {66, 6}, - {199, 9}, - {87, 9}, - {105, 9}, - {68, 6}, - {123, 9}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {111, 9}, - {70, 6}, - {129, 9}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {190, 9}, - {152, 7}, - {164, 7}, - {145, 3}, - {202, 9}, - {89, 9}, - {107, 9}, - {69, 7}, - {125, 9}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {113, 9}, - {71, 7}, - {131, 9}, - {77, 7}, - {95, 7}, - {7, 9}, - {194, 7}, - {83, 7}, - {101, 7}, - {11, 9}, - {119, 7}, - {19, 9}, - {35, 9}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 
7}, - {148, 6}, - {137, 9}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {13, 9}, - {121, 7}, - {21, 9}, - {37, 9}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {25, 9}, - {41, 9}, - {4, 7}, - {193, 6}, - {82, 6}, - {49, 9}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {205, 9}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {115, 9}, - {72, 8}, - {133, 9}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {139, 9}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {104, 8}, - {14, 9}, - {122, 8}, - {22, 9}, - {38, 9}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {26, 9}, - {42, 9}, - {5, 8}, - {193, 6}, - {82, 6}, - {50, 9}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {28, 9}, - {44, 9}, - {6, 8}, - {194, 7}, - {83, 7}, - {52, 9}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {56, 9}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {147, 5}, - {209, 12}, - {150, 5}, - {162, 5}, - {65, 5}, - {209, 12}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {176, 10}, - {148, 6}, - {188, 10}, - {151, 6}, - {163, 6}, - {66, 6}, - {200, 10}, - {154, 6}, - {166, 6}, - {68, 6}, - {178, 6}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {169, 6}, - {70, 6}, - {181, 6}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {191, 10}, - {152, 7}, - {164, 7}, - {145, 3}, - {203, 10}, - {90, 10}, - {108, 10}, - {69, 7}, - {126, 10}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {114, 10}, - {71, 7}, - {132, 10}, - {77, 7}, - {95, 7}, - {65, 5}, - {194, 7}, - {83, 7}, - {101, 7}, - {67, 5}, - {119, 7}, - {73, 5}, - {91, 5}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {138, 10}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {68, 6}, - {121, 7}, - {74, 6}, - {92, 6}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {76, 6}, - {94, 6}, - {4, 7}, - {193, 6}, - {82, 6}, - {100, 6}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {206, 10}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, 
- {116, 10}, - {72, 8}, - {134, 10}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {140, 10}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {63, 12}, - {15, 10}, - {122, 8}, - {23, 10}, - {39, 10}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {27, 10}, - {43, 10}, - {5, 8}, - {193, 6}, - {82, 6}, - {51, 10}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {29, 10}, - {45, 10}, - {6, 8}, - {194, 7}, - {83, 7}, - {53, 10}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {57, 10}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {160, 9}, - {172, 9}, - {147, 5}, - {184, 9}, - {150, 5}, - {162, 5}, - {65, 5}, - {196, 9}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {175, 9}, - {148, 6}, - {142, 10}, - {81, 9}, - {99, 9}, - {66, 6}, - {199, 9}, - {87, 9}, - {105, 9}, - {68, 6}, - {123, 9}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {111, 9}, - {70, 6}, - {129, 9}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {190, 9}, - {152, 7}, - {164, 7}, - {145, 3}, - {202, 9}, - {89, 9}, - {107, 9}, - {69, 7}, - {125, 9}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {113, 9}, - {71, 7}, - {131, 9}, - {30, 10}, - {46, 10}, - {7, 9}, - {194, 7}, - {83, 7}, - {54, 10}, - {11, 9}, - {119, 7}, - {19, 9}, - {35, 9}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {137, 9}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {58, 10}, - {13, 9}, - {121, 7}, - {21, 9}, - {37, 9}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {25, 9}, - {41, 9}, - {4, 7}, - {193, 6}, - {82, 6}, - {49, 9}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {205, 9}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {115, 9}, - {72, 8}, - {133, 9}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {139, 9}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {60, 10}, - {14, 9}, - {122, 8}, - {22, 9}, - {38, 9}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {26, 9}, - {42, 9}, - {5, 8}, - {193, 6}, - {82, 6}, - {50, 9}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 
12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {28, 9}, - {44, 9}, - {6, 8}, - {194, 7}, - {83, 7}, - {52, 9}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {56, 9}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {147, 5}, - {209, 12}, - {150, 5}, - {162, 5}, - {65, 5}, - {209, 12}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {148, 6}, - {209, 12}, - {151, 6}, - {163, 6}, - {66, 6}, - {209, 12}, - {154, 6}, - {166, 6}, - {68, 6}, - {178, 6}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {169, 6}, - {70, 6}, - {181, 6}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {192, 11}, - {152, 7}, - {164, 7}, - {145, 3}, - {204, 11}, - {155, 7}, - {167, 7}, - {69, 7}, - {179, 7}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {170, 7}, - {71, 7}, - {182, 7}, - {77, 7}, - {95, 7}, - {65, 5}, - {194, 7}, - {83, 7}, - {101, 7}, - {67, 5}, - {119, 7}, - {73, 5}, - {91, 5}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {185, 7}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {68, 6}, - {121, 7}, - {74, 6}, - {92, 6}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {76, 6}, - {94, 6}, - {4, 7}, - {193, 6}, - {82, 6}, - {100, 6}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {207, 11}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {117, 11}, - {72, 8}, - {135, 11}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {141, 11}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {104, 8}, - {68, 6}, - {122, 8}, - {74, 6}, - {92, 6}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {76, 6}, - {94, 6}, - {5, 8}, - {193, 6}, - {82, 6}, - {100, 6}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {77, 7}, - {95, 7}, - {6, 8}, - {194, 7}, - {83, 7}, - {101, 7}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - 
{109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {160, 9}, - {172, 9}, - {147, 5}, - {184, 9}, - {150, 5}, - {162, 5}, - {65, 5}, - {196, 9}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {175, 9}, - {148, 6}, - {143, 11}, - {81, 9}, - {99, 9}, - {66, 6}, - {199, 9}, - {87, 9}, - {105, 9}, - {68, 6}, - {123, 9}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {111, 9}, - {70, 6}, - {129, 9}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {190, 9}, - {152, 7}, - {164, 7}, - {145, 3}, - {202, 9}, - {89, 9}, - {107, 9}, - {69, 7}, - {125, 9}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {113, 9}, - {71, 7}, - {131, 9}, - {31, 11}, - {47, 11}, - {7, 9}, - {194, 7}, - {83, 7}, - {55, 11}, - {11, 9}, - {119, 7}, - {19, 9}, - {35, 9}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {137, 9}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {59, 11}, - {13, 9}, - {121, 7}, - {21, 9}, - {37, 9}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {25, 9}, - {41, 9}, - {4, 7}, - {193, 6}, - {82, 6}, - {49, 9}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {205, 9}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {115, 9}, - {72, 8}, - {133, 9}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {139, 9}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {61, 11}, - {14, 9}, - {122, 8}, - {22, 9}, - {38, 9}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {26, 9}, - {42, 9}, - {5, 8}, - {193, 6}, - {82, 6}, - {50, 9}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {28, 9}, - {44, 9}, - {6, 8}, - {194, 7}, - {83, 7}, - {52, 9}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {56, 9}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {209, 12}, - {209, 12}, - {147, 5}, - {209, 12}, - {150, 5}, - {162, 5}, - {65, 5}, - {209, 12}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 
12}, - {176, 10}, - {148, 6}, - {188, 10}, - {151, 6}, - {163, 6}, - {66, 6}, - {200, 10}, - {154, 6}, - {166, 6}, - {68, 6}, - {178, 6}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {169, 6}, - {70, 6}, - {181, 6}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {191, 10}, - {152, 7}, - {164, 7}, - {145, 3}, - {203, 10}, - {90, 10}, - {108, 10}, - {69, 7}, - {126, 10}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {114, 10}, - {71, 7}, - {132, 10}, - {77, 7}, - {95, 7}, - {65, 5}, - {194, 7}, - {83, 7}, - {101, 7}, - {67, 5}, - {119, 7}, - {73, 5}, - {91, 5}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {138, 10}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {103, 7}, - {68, 6}, - {121, 7}, - {74, 6}, - {92, 6}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {76, 6}, - {94, 6}, - {4, 7}, - {193, 6}, - {82, 6}, - {100, 6}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {206, 10}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {116, 10}, - {72, 8}, - {134, 10}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {140, 10}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {62, 11}, - {15, 10}, - {122, 8}, - {23, 10}, - {39, 10}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {27, 10}, - {43, 10}, - {5, 8}, - {193, 6}, - {82, 6}, - {51, 10}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {29, 10}, - {45, 10}, - {6, 8}, - {194, 7}, - {83, 7}, - {53, 10}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {57, 10}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {209, 12}, - {209, 12}, - {209, 12}, - {146, 4}, - {209, 12}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {160, 9}, - {172, 9}, - {147, 5}, - {184, 9}, - {150, 5}, - {162, 5}, - {65, 5}, - {196, 9}, - {153, 5}, - {165, 5}, - {67, 5}, - {177, 5}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {175, 9}, - {148, 6}, - {142, 10}, - {81, 9}, - {99, 9}, - {66, 6}, - {199, 9}, - {87, 9}, - {105, 9}, - {68, 6}, - {123, 9}, - {74, 6}, - {92, 6}, - {64, 4}, - {209, 12}, - {157, 6}, - {111, 9}, - {70, 6}, - {129, 9}, - {76, 6}, - {94, 6}, - {65, 5}, - {193, 6}, - {82, 6}, - {100, 6}, - {67, 5}, - {118, 6}, - {73, 5}, - {91, 5}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {190, 9}, - {152, 7}, - {164, 7}, - {145, 3}, - {202, 9}, - {89, 9}, - {107, 9}, - {69, 7}, - {125, 9}, - {75, 7}, - {93, 7}, - {64, 4}, - 
{209, 12}, - {158, 7}, - {113, 9}, - {71, 7}, - {131, 9}, - {30, 10}, - {46, 10}, - {7, 9}, - {194, 7}, - {83, 7}, - {54, 10}, - {11, 9}, - {119, 7}, - {19, 9}, - {35, 9}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {137, 9}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {58, 10}, - {13, 9}, - {121, 7}, - {21, 9}, - {37, 9}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {25, 9}, - {41, 9}, - {4, 7}, - {193, 6}, - {82, 6}, - {49, 9}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {145, 3}, - {205, 9}, - {156, 8}, - {168, 8}, - {146, 4}, - {180, 8}, - {149, 4}, - {161, 4}, - {64, 4}, - {209, 12}, - {159, 8}, - {115, 9}, - {72, 8}, - {133, 9}, - {78, 8}, - {96, 8}, - {65, 5}, - {195, 8}, - {84, 8}, - {102, 8}, - {67, 5}, - {120, 8}, - {73, 5}, - {91, 5}, - {64, 4}, - {209, 12}, - {209, 12}, - {174, 8}, - {148, 6}, - {139, 9}, - {80, 8}, - {98, 8}, - {66, 6}, - {198, 8}, - {86, 8}, - {60, 10}, - {14, 9}, - {122, 8}, - {22, 9}, - {38, 9}, - {3, 8}, - {209, 12}, - {157, 6}, - {110, 8}, - {70, 6}, - {128, 8}, - {26, 9}, - {42, 9}, - {5, 8}, - {193, 6}, - {82, 6}, - {50, 9}, - {9, 8}, - {118, 6}, - {17, 8}, - {33, 8}, - {0, 6}, - {209, 12}, - {209, 12}, - {209, 12}, - {209, 12}, - {189, 8}, - {152, 7}, - {164, 7}, - {145, 3}, - {201, 8}, - {88, 8}, - {106, 8}, - {69, 7}, - {124, 8}, - {75, 7}, - {93, 7}, - {64, 4}, - {209, 12}, - {158, 7}, - {112, 8}, - {71, 7}, - {130, 8}, - {28, 9}, - {44, 9}, - {6, 8}, - {194, 7}, - {83, 7}, - {52, 9}, - {10, 8}, - {119, 7}, - {18, 8}, - {34, 8}, - {1, 7}, - {209, 12}, - {209, 12}, - {173, 7}, - {148, 6}, - {136, 8}, - {79, 7}, - {97, 7}, - {66, 6}, - {197, 7}, - {85, 7}, - {56, 9}, - {12, 8}, - {121, 7}, - {20, 8}, - {36, 8}, - {2, 7}, - {209, 12}, - {157, 6}, - {109, 7}, - {70, 6}, - {127, 7}, - {24, 8}, - {40, 8}, - {4, 7}, - {193, 6}, - {82, 6}, - {48, 8}, - {8, 7}, - {118, 6}, - {16, 7}, - {32, 7}, - {0, 6}}; -} // utf8_to_utf16 namespace -} // tables namespace -} // unnamed namespace -} // namespace simdutf +1: imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes -#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H -/* end file src/tables/utf8_to_utf16_tables.h */ -/* begin file src/tables/utf16_to_utf8_tables.h */ -// file generated by scripts/sse_convert_utf16_to_utf8.py -#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H -#define SIMDUTF_UTF16_TO_UTF8_TABLES_H +2: imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes -namespace simdutf { -namespace { -namespace tables { -namespace utf16_to_utf8 { +3: imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes - // 1 byte for length, 16 bytes for mask - const uint8_t pack_1_2_utf8_bytes[256][17] = { - {16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}, - {15,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80}, - {15,1,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80}, - {14,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, - {15,1,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80}, - {14,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, - {14,1,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, - {13,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, - {15,1,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80}, - {14,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, - {14,1,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80}, - {13,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, - {14,1,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, - {13,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, - 
{13,1,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, - {12,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {15,1,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80}, - {14,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, - {14,1,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, - {13,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, - {14,1,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, - {13,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, - {13,1,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, - {12,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {14,1,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, - {13,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, - {13,1,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, - {12,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {13,1,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, - {12,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {12,1,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {15,1,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80}, - {14,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, - {14,1,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80}, - {13,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, - {14,1,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, - {13,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, - {13,1,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, - {12,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, - {14,1,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80}, - {13,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, - {13,1,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80}, - {12,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, - {13,1,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, - {12,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, - {12,1,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {14,1,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, - {13,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, - {13,1,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, - {12,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, - {13,1,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, - {12,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, - {12,1,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {13,1,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, - {12,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {15,1,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80}, - {14,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, - {14,1,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80}, - {13,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, - {14,1,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, - {13,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, - {13,1,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, - {12,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {14,1,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80}, - {13,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, - {13,1,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80}, - {12,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {13,1,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, - {12,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {12,1,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, - 
{11,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {14,1,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, - {13,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, - {13,1,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, - {12,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {13,1,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, - {12,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {12,1,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {13,1,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, - {12,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {14,1,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80}, - {13,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, - {13,1,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80}, - {12,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, - {13,1,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, - {12,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, - {12,1,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {13,1,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80}, - {12,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {13,1,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, - {12,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,1,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {15,1,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80}, - {14,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, - {14,1,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80}, - {13,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, - {14,1,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, - {13,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, - {13,1,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, - {12,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, - {14,1,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80}, - {13,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, - {13,1,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80}, - {12,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, - {13,1,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, - 
{12,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, - {12,1,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, - {11,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {14,1,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, - {13,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, - {13,1,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, - {12,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, - {13,1,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, - {12,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, - {12,1,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, - {11,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {13,1,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, - {12,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, - {11,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {14,1,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80}, - {13,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, - {13,1,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80}, - {12,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, - {13,1,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, - {12,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, - {12,1,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, - {11,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {13,1,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80}, - {12,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, - {11,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {13,1,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, - {12,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, - {11,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,1,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {14,1,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80}, - {13,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, - {13,1,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80}, - {12,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, - {13,1,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, - {12,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, - {12,1,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, - {11,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {13,1,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80}, - {12,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, - 
{11,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, - {11,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {13,1,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, - {12,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, - {11,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,1,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {13,1,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80}, - {12,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, - {11,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,1,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {12,1,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, - {11,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,1,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {11,1,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, - {10,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,1,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,1,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,1,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80} - }; +4: imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes - // 1 byte for length, 16 bytes for mask - const uint8_t pack_1_2_3_utf8_bytes[256][17] = { - {12,2,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80}, - {9,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {11,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, - 
{10,0,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,2,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,0,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {11,2,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, - {8,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,0,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,2,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,0,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,2,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,0,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,2,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,0,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {11,2,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, - {8,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,0,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,2,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,0,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,2,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,0,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,2,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,0,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - 
{6,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,2,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,0,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,2,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,0,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,2,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,0,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,2,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,0,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {1,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,2,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {2,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,0,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,2,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {2,0,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,2,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {2,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - 
{4,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,0,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,2,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,0,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,2,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {2,0,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,2,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,0,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,2,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {2,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,0,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {11,2,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80}, - {8,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,0,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,2,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,0,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,2,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,0,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,2,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {2,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - 
{3,0,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,2,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,0,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,2,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,0,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,2,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,0,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,2,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,0,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,2,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,0,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {10,2,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,0,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,2,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - 
{6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,0,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,2,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {2,0,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,2,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,0,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,2,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {2,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,0,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {9,2,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,0,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,2,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,0,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,0,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {8,2,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {6,0,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {5,2,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {2,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {4,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {3,0,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - {7,2,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, - 
{4,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
- {6,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
- {5,0,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
- {6,2,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
- {3,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
- {5,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
- {4,0,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
- };
+5: imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes
-} // utf16_to_utf8 namespace
-} // tables namespace
-} // unnamed namespace
-} // namespace simdutf
+6: imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes
-#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
-/* end file src/tables/utf16_to_utf8_tables.h */
-// End of tables.
+7: imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all
+lanes
-// The scalar routines should be included once.
-/* begin file src/scalar/ascii.h */
-#ifndef SIMDUTF_ASCII_H
-#define SIMDUTF_ASCII_H
+8: imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to
+all lanes
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace ascii {
-#if SIMDUTF_IMPLEMENTATION_FALLBACK
-// Only used by the fallback kernel.
-inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  uint64_t pos = 0;
-  // process in blocks of 16 bytes when possible
-  for (;pos + 16 <= len; pos += 16) {
-    uint64_t v1;
-    std::memcpy(&v1, data + pos, sizeof(uint64_t));
-    uint64_t v2;
-    std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-    uint64_t v{v1 | v2};
-    if ((v & 0x8080808080808080) != 0) { return false; }
-  }
-  // process the tail byte-by-byte
-  for (;pos < len; pos ++) {
-    if (data[pos] >= 0b10000000) { return false; }
-  }
-  return true;
-}
-#endif
+9: imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes
-inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  // process in blocks of 16 bytes when possible
-  for (;pos + 16 <= len; pos += 16) {
-    uint64_t v1;
-    std::memcpy(&v1, data + pos, sizeof(uint64_t));
-    uint64_t v2;
-    std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-    uint64_t v{v1 | v2};
-    if ((v & 0x8080808080808080) != 0) {
-      for (;pos < len; pos ++) {
-        if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
-      }
-    }
-  }
-  // process the tail byte-by-byte
-  for (;pos < len; pos ++) {
-    if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
-  }
-  return result(error_code::SUCCESS, pos);
-}
+10: imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast
+the result as 64-bit elements to all lanes
+*/
-} // ascii namespace
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+namespace lasx_vldi {
-#endif
-/* end file src/scalar/ascii.h */
-/* begin file src/scalar/latin1.h */
-#ifndef SIMDUTF_LATIN1_H
-#define SIMDUTF_LATIN1_H
+template <uint16_t v> class const_u16 {
+  constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff);
+  constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff);
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace latin1 {
+  constexpr static bool is_case5 = uint16_t(b0) == v;
+  constexpr static bool is_case6 = (uint16_t(b1) << 8) == v;
+  constexpr static bool is_case9 = (b0 == b1);
+  constexpr static bool is_case10 =
+      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00));
-inline size_t utf32_length_from_latin1(size_t len) {
-  // We are not BOM aware.
-  return len; // a utf32 unit will always represent 1 latin1 character
-}
+public:
+  constexpr static uint16_t operation = is_case5    ? 0b10100
+                                        : is_case6  ? 0b10101
+                                        : is_case9  ? 0b11000
+                                        : is_case10 ? 0b11001
+                                                    : 0xffff;
+
+  constexpr static uint16_t byte =
+      is_case5    ? b0
+      : is_case6  ? b1
+      : is_case9  ? b0
+      : is_case10 ? ((b0 ? 0x55 : 0x00) | (b1 ? 0xaa : 0x00))
+                  : 0xffff;
+
+  constexpr static int value = int((operation << 8) | byte) - 8192;
+  constexpr static bool valid = operation != 0xffff;
+};
-inline size_t utf8_length_from_latin1(const char *buf, size_t len) {
-  const uint8_t * c = reinterpret_cast<const uint8_t *>(buf);
-  size_t answer = 0;
-  for(size_t i = 0; i < len; i++) {
-    if((c[i]>>7)) { answer++; }
-  }
-  return answer + len;
-}
+template <uint32_t v> class const_u32 {
+  constexpr static const uint8_t b0 = (v & 0xff);
+  constexpr static const uint8_t b1 = ((v >> 8) & 0xff);
+  constexpr static const uint8_t b2 = ((v >> 16) & 0xff);
+  constexpr static const uint8_t b3 = ((v >> 24) & 0xff);
+
+  constexpr static bool is_case1 = (uint32_t(b0) == v);
+  constexpr static bool is_case2 = ((uint32_t(b1) << 8) == v);
+  constexpr static bool is_case3 = ((uint32_t(b2) << 16) == v);
+  constexpr static bool is_case4 = ((uint32_t(b3) << 24) == v);
+  constexpr static bool is_case5 = (b0 == b2) && (b1 == 0) && (b3 == 0);
+  constexpr static bool is_case6 = (b1 == b3) && (b0 == 0) && (b2 == 0);
+  constexpr static bool is_case7 = (b3 == 0) && (b2 == 0) && (b0 == 0xff);
+  constexpr static bool is_case8 = (b3 == 0) && (b1 == 0xff) && (b0 == 0xff);
+  constexpr static bool is_case9 = (b0 == b1) && (b0 == b2) && (b0 == b3);
+  constexpr static bool is_case10 =
+      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) &&
+      ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00));
-inline size_t utf16_length_from_latin1(size_t len) {
-  return len;
-}
+public:
+  constexpr static uint16_t operation = is_case1    ? 0b10000
+                                        : is_case2  ? 0b10001
+                                        : is_case3  ? 0b10010
+                                        : is_case4  ? 0b10011
+                                        : is_case5  ? 0b10100
+                                        : is_case6  ? 0b10101
+                                        : is_case7  ? 0b10110
+                                        : is_case8  ? 0b10111
+                                        : is_case9  ? 0b11000
+                                        : is_case10 ? 0b11001
+                                                    : 0xffff;
+
+  constexpr static uint16_t byte =
+      is_case1    ? b0
+      : is_case2  ? b1
+      : is_case3  ? b2
+      : is_case4  ? b3
+      : is_case5  ? b0
+      : is_case6  ? b1
+      : is_case7  ? b1
+      : is_case8  ? b2
+      : is_case9  ? b0
+      : is_case10 ? ((b0 ? 0x11 : 0x00) | (b1 ? 0x22 : 0x00) |
+                     (b2 ? 0x44 : 0x00) | (b3 ? 0x88 : 0x00))
+                  : 0xffff;
+
+  constexpr static int value = int((operation << 8) | byte) - 8192;
+  constexpr static bool valid = operation != 0xffff;
+};
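// Illustration (editor's sketch, not part of the vendored simdutf sources):
// compile-time checks of the vldi cases documented above, assuming they sit
// after const_u32 in the same translation unit.
static_assert(lasx_vldi::const_u32<0x20202020>::valid, "four equal bytes: case 9");
static_assert(lasx_vldi::const_u32<0x20202020>::operation == 0b11000, "case 9 opcode");
static_assert(lasx_vldi::const_u32<0x00800080>::operation == 0b10100, "repeated 16-bit pattern, zero high bytes: case 5");
static_assert(!lasx_vldi::const_u32<0x12345678>::valid, "no vldi immediate exists");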
-} // utf32 namespace
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+template <uint64_t v> class const_u64 {
+  constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff);
+  constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff);
+  constexpr static const uint8_t b2 = ((v >> 2 * 8) & 0xff);
+  constexpr static const uint8_t b3 = ((v >> 3 * 8) & 0xff);
+  constexpr static const uint8_t b4 = ((v >> 4 * 8) & 0xff);
+  constexpr static const uint8_t b5 = ((v >> 5 * 8) & 0xff);
+  constexpr static const uint8_t b6 = ((v >> 6 * 8) & 0xff);
+  constexpr static const uint8_t b7 = ((v >> 7 * 8) & 0xff);
+
+  constexpr static bool is_case10 =
+      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) &&
+      ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)) &&
+      ((b4 == 0xff) || (b4 == 0x00)) && ((b5 == 0xff) || (b5 == 0x00)) &&
+      ((b6 == 0xff) || (b6 == 0x00)) && ((b7 == 0xff) || (b7 == 0x00));
+
+public:
+  constexpr static bool is_32bit =
+      ((v & 0xffffffff) == (v >> 32)) && const_u32<(v >> 32)>::valid;
+  constexpr static uint8_t op_32bit = const_u32<(v >> 32)>::operation;
+  constexpr static uint8_t byte_32bit = const_u32<(v >> 32)>::byte;
+
+  constexpr static uint16_t operation = is_32bit    ? op_32bit
+                                        : is_case10 ? 0b11001
+                                                    : 0xffff;
+
+  constexpr static uint16_t byte =
+      is_32bit ? byte_32bit
+      : is_case10
+          ? ((b0 ? 0x01 : 0x00) | (b1 ? 0x02 : 0x00) | (b2 ? 0x04 : 0x00) |
+             (b3 ? 0x08 : 0x00) | (b4 ? 0x10 : 0x00) | (b5 ? 0x20 : 0x00) |
+             (b6 ? 0x40 : 0x00) | (b7 ? 0x80 : 0x00))
+          : 0xffff;
+
+  constexpr static int value = int((operation << 8) | byte) - 8192;
+  constexpr static bool valid = operation != 0xffff;
+};
+
+} // namespace lasx_vldi
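// Illustration (editor's sketch, not part of the vendored simdutf sources):
// a 64-bit constant is encodable only when its two 32-bit halves are equal
// and themselves encodable, or when every byte is 0x00 or 0xFF (case 10).
static_assert(lasx_vldi::const_u64<0x2020202020202020>::valid, "equal halves, case 9");
static_assert(lasx_vldi::const_u64<0x00000000ffffffff>::valid, "bytes all 0x00/0xff: case 10");
static_assert(!lasx_vldi::const_u64<0x0123456789abcdef>::valid, "not encodable");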
+// Uncomment when running under QEMU affected
+// by bug https://gitlab.com/qemu-project/qemu/-/issues/2865
+// Versions <= 9.2.2 are affected, likely anything newer is correct.
+#ifndef QEMU_VLDI_BUG
+// #define QEMU_VLDI_BUG 1
+#endif
-/* end file src/scalar/latin1.h */
-/* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
-#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
-#define SIMDUTF_VALID_UTF32_TO_UTF8_H
+#ifdef QEMU_VLDI_BUG
+  #define lasx_splat_u16(v) __lasx_xvreplgr2vr_h(v)
+  #define lasx_splat_u32(v) __lasx_xvreplgr2vr_w(v)
+#else
+template <uint16_t x> constexpr __m256i lasx_splat_u16_aux() {
+  constexpr bool is_imm10 = (int16_t(x) < 512) && (int16_t(x) > -512);
+  constexpr uint16_t imm10 = is_imm10 ? x : 0;
+  constexpr bool is_vldi = lasx_vldi::const_u16<x>::valid;
+  constexpr int vldi_imm = is_vldi ? lasx_vldi::const_u16<x>::value : 0;
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf32_to_utf8 {
+
+  return is_imm10  ? __lasx_xvrepli_h(int16_t(imm10))
+         : is_vldi ? __lasx_xvldi(vldi_imm)
+                   : __lasx_xvreplgr2vr_h(x);
+}
-#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
-// only used by the fallback and POWER kernel
-inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-        *utf8_output++ = char(buf[pos+1]);
-        pos += 2;
-        continue;
-      }
-    }
-    uint32_t word = data[pos];
-    if((word & 0xFFFFFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xFFFFF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word & 0xFFFF0000)==0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>18) | 0b11110000);
-      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos ++;
-    }
-  }
-  return utf8_output - start;
-}
+template <uint32_t x> constexpr __m256i lasx_splat_u32_aux() {
+  constexpr bool is_imm10 = (int32_t(x) < 512) && (int32_t(x) > -512);
+  constexpr uint32_t imm10 = is_imm10 ? x : 0;
+  constexpr bool is_vldi = lasx_vldi::const_u32<x>::valid;
+  constexpr int vldi_imm = is_vldi ? lasx_vldi::const_u32<x>::value : 0;
+
+  return is_imm10  ? __lasx_xvrepli_w(int32_t(imm10))
+         : is_vldi ? __lasx_xvldi(vldi_imm)
+                   : __lasx_xvreplgr2vr_w(x);
+}
-#endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
-} // utf32_to_utf8 namespace
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+  #define lasx_splat_u16(v) lasx_splat_u16_aux<(v)>()
+  #define lasx_splat_u32(v) lasx_splat_u32_aux<(v)>()
+#endif // QEMU_VLDI_BUG
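// Illustration (editor's sketch, not part of the vendored simdutf sources):
// what a call site gets from the dispatch above, assuming a LoongArch LASX
// target. The argument must be a compile-time constant, since it becomes a
// template argument of the _aux functions. splat_examples is a hypothetical
// helper, named here only for the sketch.
static inline __m256i splat_examples() {
  __m256i case9 = lasx_splat_u32(0x80808080); // four equal bytes: one xvldi
  __m256i imm10 = lasx_splat_u32(0x1ff);      // fits the signed 10-bit immediate: xvrepli.w
  return __lasx_xvor_v(case9, imm10);         // combine the two, just to use both
  // lasx_splat_u32(0x12345678) would fall back to xvreplgr2vr.w.
}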
-#endif
-/* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
-/* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
-#ifndef SIMDUTF_UTF32_TO_UTF8_H
-#define SIMDUTF_UTF32_TO_UTF8_H
+#ifndef lsx_splat_u16
+  #ifdef QEMU_VLDI_BUG
+    #define lsx_splat_u16(v) __lsx_vreplgr2vr_h(v)
+    #define lsx_splat_u32(v) __lsx_vreplgr2vr_w(v)
+  #else
+namespace {
+template <uint16_t x> constexpr __m128i lsx_splat_u16_aux() {
+  return ((int16_t(x) < 512) && (int16_t(x) > -512))
+             ? __lsx_vrepli_h(
+                   ((int16_t(x) < 512) && (int16_t(x) > -512)) ? int16_t(x) : 0)
+             : (lasx_vldi::const_u16<x>::valid
+                    ? __lsx_vldi(lasx_vldi::const_u16<x>::valid
+                                     ? lasx_vldi::const_u16<x>::value
+                                     : 0)
+                    : __lsx_vreplgr2vr_h(x));
+}
+
+template <uint32_t x> constexpr __m128i lsx_splat_u32_aux() {
+  return ((int32_t(x) < 512) && (int32_t(x) > -512))
+             ? __lsx_vrepli_w(
+                   ((int32_t(x) < 512) && (int32_t(x) > -512)) ? int32_t(x) : 0)
+             : (lasx_vldi::const_u32<x>::valid
+                    ? __lsx_vldi(lasx_vldi::const_u32<x>::valid
+                                     ? lasx_vldi::const_u32<x>::value
+                                     : 0)
+                    : __lsx_vreplgr2vr_w(x));
+}
+} // namespace
+  #define lsx_splat_u16(v) lsx_splat_u16_aux<(v)>()
+  #define lsx_splat_u32(v) lsx_splat_u32_aux<(v)>()
+  #endif // QEMU_VLDI_BUG
+#endif // lsx_splat_u16
+
+#endif // SIMDUTF_LASX_INTRINSICS_H
+/* end file src/simdutf/lasx/intrinsics.h */
+/* begin file src/simdutf/lasx/bitmanipulation.h */
+#ifndef SIMDUTF_LASX_BITMANIPULATION_H
+#define SIMDUTF_LASX_BITMANIPULATION_H
+
+#include
 namespace simdutf {
-namespace scalar {
+namespace lasx {
 namespace {
-namespace utf32_to_utf8 {
-inline size_t convert(const char32_t* buf, size_t len, char* utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-        *utf8_output++ = char(buf[pos+1]);
-        pos += 2;
-        continue;
-      }
-    }
-    uint32_t word = data[pos];
-    if((word & 0xFFFFFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xFFFFF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word & 0xFFFF0000)==0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      if (word > 0x10FFFF) { return 0; }
-      *utf8_output++ = char((word>>18) | 0b11110000);
-      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos ++;
-    }
-  }
-  return utf8_output - start;
-}
+simdutf_really_inline int count_ones(uint64_t input_num) {
+  return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0);
+}
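// Illustration (editor's sketch, not part of the vendored simdutf sources):
// count_ones routes a scalar popcount through the LSX vector unit. The 64-bit
// input is broadcast into a vector register, __lsx_vpcnt_d counts the bits of
// each 64-bit lane, and lane 0 of the 32-bit view is read back. For every
// input it should agree with the compiler builtin, e.g.
//   count_ones(0x00f0f0f0f0f0f0f0) == 28 == __builtin_popcountll(0x00f0f0f0f0f0f0f0)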
-inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-        *utf8_output++ = char(buf[pos+1]);
-        pos += 2;
-        continue;
-      }
-    }
-    uint32_t word = data[pos];
-    if((word & 0xFFFFFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xFFFFF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word & 0xFFFF0000)==0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
-      *utf8_output++ = char((word>>18) | 0b11110000);
-      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos ++;
-    }
-  }
-  return result(error_code::SUCCESS, utf8_output - start);
-}
+#if SIMDUTF_NEED_TRAILING_ZEROES
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
+  return __builtin_ctzll(input_num);
+}
+#endif

-} // utf32_to_utf8 namespace
 } // unnamed namespace
-} // namespace scalar
+} // namespace lasx
 } // namespace simdutf

-#endif
-/* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
+#endif // SIMDUTF_LASX_BITMANIPULATION_H
+/* end file src/simdutf/lasx/bitmanipulation.h */
+/* begin file src/simdutf/lasx/simd.h */
+#ifndef SIMDUTF_LASX_SIMD_H
+#define SIMDUTF_LASX_SIMD_H

-/* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
-#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
-#define SIMDUTF_VALID_UTF32_TO_UTF16_H
 namespace simdutf {
-namespace scalar {
+namespace lasx {
 namespace {
-namespace utf32_to_utf16 {
+namespace simd {

-template <endianness big_endian>
-inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    uint32_t word = data[pos];
-    if((word & 0xFFFF0000)==0) {
-      // will not generate a surrogate pair
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
-      pos++;
-    } else {
-      // will generate a surrogate pair
-      word -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos++;
-    }
-  }
-  return utf16_output - start;
-}
+__attribute__((aligned(32))) static const uint8_t prev_shuf_table[32][32] = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+     31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
+    {0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+     30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
+    {0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+     29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+    {0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+     28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+    {0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+     27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+    {0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+     26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+    {0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8,
+     25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7,
+     24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6,
+     23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5,
+     22, 
23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0}, + {15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0}, + {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0}, + {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0}, + {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0}, + {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0}, + {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0}, + {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, +}; -} // utf32_to_utf16 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +__attribute__((aligned(32))) static const uint8_t bitsel_mask_table[32][32] = { + {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0}}; -#endif -/* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ -/* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */ -#ifndef SIMDUTF_UTF32_TO_UTF16_H -#define SIMDUTF_UTF32_TO_UTF16_H +// Forward-declared so they can be used by splat and friends. +template struct base { + __m256i value; -namespace simdutf { -namespace scalar { -namespace { -namespace utf32_to_utf16 { + // Zero constructor + simdutf_really_inline base() : value{__m256i()} {} -template -inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output) { - const uint32_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t* start{utf16_output}; - while (pos < len) { - uint32_t word = data[pos]; - if((word & 0xFFFF0000)==0) { - if (word >= 0xD800 && word <= 0xDFFF) { return 0; } - // will not generate a surrogate pair - *utf16_output++ = !match_system(big_endian) ? 
char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); + // Conversion from SIMD register + simdutf_really_inline base(const __m256i _value) : value(_value) {} + // Conversion to SIMD register + simdutf_really_inline operator const __m256i &() const { return this->value; } + simdutf_really_inline operator __m256i &() { return this->value; } + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + if (big_endian) { + __m256i zero = __lasx_xvldi(0); + __m256i in8 = __lasx_xvpermi_d(this->value, 0b11011000); + __m256i inlow = __lasx_xvilvl_b(in8, zero); + __m256i inhigh = __lasx_xvilvh_b(in8, zero); + __lasx_xvst(inlow, reinterpret_cast(ptr), 0); + __lasx_xvst(inhigh, reinterpret_cast(ptr), 32); } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return 0; } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); + __m256i inlow = __lasx_vext2xv_hu_bu(this->value); + __m256i inhigh = __lasx_vext2xv_hu_bu( + __lasx_xvpermi_q(this->value, this->value, 0b00000001)); + __lasx_xvst(inlow, reinterpret_cast<__m256i *>(ptr), 0); + __lasx_xvst(inhigh, reinterpret_cast<__m256i *>(ptr), 32); } - pos++; } - return utf16_output - start; -} + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + __m256i in32_0 = __lasx_vext2xv_wu_bu(this->value); + __lasx_xvst(in32_0, reinterpret_cast(ptr), 0); -template -inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { - const uint32_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t* start{utf16_output}; - while (pos < len) { - uint32_t word = data[pos]; - if((word & 0xFFFF0000)==0) { - if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); } - // will not generate a surrogate pair - *utf16_output++ = !match_system(big_endian) ? 
char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - pos++; + __m256i in8_1 = __lasx_xvpermi_d(this->value, 0b00000001); + __m256i in32_1 = __lasx_vext2xv_wu_bu(in8_1); + __lasx_xvst(in32_1, reinterpret_cast(ptr), 32); + + __m256i in8_2 = __lasx_xvpermi_d(this->value, 0b00000010); + __m256i in32_2 = __lasx_vext2xv_wu_bu(in8_2); + __lasx_xvst(in32_2, reinterpret_cast(ptr), 64); + + __m256i in8_3 = __lasx_xvpermi_d(this->value, 0b00000011); + __m256i in32_3 = __lasx_vext2xv_wu_bu(in8_3); + __lasx_xvst(in32_3, reinterpret_cast(ptr), 96); } - return result(error_code::SUCCESS, utf16_output - start); -} + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { + return __lasx_xvor_v(this->value, other); + } + simdutf_really_inline Child operator&(const Child other) const { + return __lasx_xvand_v(this->value, other); + } + simdutf_really_inline Child operator^(const Child other) const { + return __lasx_xvxor_v(this->value, other); + } + simdutf_really_inline Child &operator|=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast | other; + return *this_cast; + } +}; -} // utf32_to_utf16 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +template struct simd8; -#endif -/* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */ +template > +struct base8 : base> { + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m256i _value) : base>(_value) {} + friend simdutf_really_inline Mask operator==(const simd8 lhs, + const simd8 rhs) { + return __lasx_xvseq_b(lhs, rhs); + } -/* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ -#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H -#define SIMDUTF_VALID_UTF16_TO_UTF8_H + static const int SIZE = sizeof(base::value); -namespace simdutf { -namespace scalar { -namespace { -namespace utf16_to_utf8 { + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + static_assert(N <= 16, "unsupported shift value"); -template -inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char* start{utf8_output}; - while (pos < len) { - // try to convert the next block of 4 ASCII characters - if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (!match_system(big_endian)) { v = (v >> 8) | (v << (64 - 8)); } - if ((v & 0xFF80FF80FF80FF80) == 0) { - size_t final_pos = pos + 4; - while(pos < final_pos) { - *utf8_output++ = !match_system(big_endian) ? 
char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); - pos++; - } - continue; - } + if (!N) + return this->value; + + __m256i zero = __lasx_xvldi(0); + __m256i result, shuf; + if (N < 16) { + shuf = __lasx_xvld(prev_shuf_table[N], 0); + + result = __lasx_xvshuf_b( + __lasx_xvpermi_q(this->value, this->value, 0b00000001), this->value, + shuf); + __m256i srl_prev = __lasx_xvbsrl_v( + __lasx_xvpermi_q(zero, prev_chunk.value, 0b00110001), (16 - N)); + __m256i mask = __lasx_xvld(bitsel_mask_table[N], 0); + result = __lasx_xvbitsel_v(result, srl_prev, mask); + + return result; + } else if (N == 16) { + return __lasx_xvpermi_q(this->value, prev_chunk.value, 0b00100001); } + } +}; - uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if((word & 0xFF80)==0) { - // will generate one UTF-8 bytes - *utf8_output++ = char(word); - pos++; - } else if((word & 0xF800)==0) { - // will generate two UTF-8 bytes - // we have 0b110XXXXX 0b10XXXXXX - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else if((word &0xF800 ) != 0xD800) { - // will generate three UTF-8 bytes - // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - if(pos + 1 >= len) { return 0; } // minimal bound checking - uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - uint32_t value = (diff << 10) + diff2 + 0x10000; - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - pos += 2; - } +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base8 { + static simdutf_really_inline simd8 splat(bool _value) { + return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value))); } - return utf8_output - start; -} - -} // utf16_to_utf8 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf -#endif -/* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ -/* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ -#ifndef SIMDUTF_UTF16_TO_UTF8_H -#define SIMDUTF_UTF16_TO_UTF8_H + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m256i _value) : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} -namespace simdutf { -namespace scalar { -namespace { -namespace utf16_to_utf8 { + simdutf_really_inline uint32_t to_bitmask() const { + __m256i mask = __lasx_xvmsknz_b(this->value); + uint32_t mask0 = __lasx_xvpickve2gr_wu(mask, 0); + uint32_t mask1 = __lasx_xvpickve2gr_wu(mask, 4); + return (mask0 | (mask1 << 16)); + } + simdutf_really_inline bool any() const { + if (__lasx_xbz_b(this->value)) + return false; + return true; + } + simdutf_really_inline simd8 operator~() const { return *this ^ true; } +}; -template -inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char* start{utf8_output}; - while (pos < len) { - // try 
to convert the next block of 8 bytes - if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (!match_system(big_endian)) { v = (v >> 8) | (v << (64 - 8)); } - if ((v & 0xFF80FF80FF80FF80) == 0) { - size_t final_pos = pos + 4; - while(pos < final_pos) { - *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); - pos++; - } - continue; - } - } - uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if((word & 0xFF80)==0) { - // will generate one UTF-8 bytes - *utf8_output++ = char(word); - pos++; - } else if((word & 0xF800)==0) { - // will generate two UTF-8 bytes - // we have 0b110XXXXX 0b10XXXXXX - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else if((word &0xF800 ) != 0xD800) { - // will generate three UTF-8 bytes - // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else { - // must be a surrogate pair - if(pos + 1 >= len) { return 0; } - uint16_t diff = uint16_t(word - 0xD800); - if(diff > 0x3FF) { return 0; } - uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if(diff2 > 0x3FF) { return 0; } - uint32_t value = (diff << 10) + diff2 + 0x10000; - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - pos += 2; - } +template struct base8_numeric : base8 { + static simdutf_really_inline simd8 splat(T _value) { + return __lasx_xvreplgr2vr_b(_value); + } + static simdutf_really_inline simd8 zero() { return __lasx_xvldi(0); } + static simdutf_really_inline simd8 load(const T values[32]) { + return __lasx_xvld(reinterpret_cast(values), 0); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, + T v5, T v6, T v7, T v8, T v9, + T v10, T v11, T v12, T v13, + T v14, T v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15); } - return utf8_output - start; -} -template -inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char* start{utf8_output}; - while (pos < len) { - // try to convert the next block of 8 bytes - if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8)); - if ((v & 0xFF80FF80FF80FF80) == 0) { - size_t final_pos = pos + 4; - while(pos < final_pos) { - *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); - pos++; - } - continue; - } - } - uint16_t word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos]) : data[pos]; - if((word & 0xFF80)==0) { - // will generate one UTF-8 bytes - *utf8_output++ = char(word); - pos++; - } else if((word & 0xF800)==0) { - // will generate two UTF-8 bytes - // we have 0b110XXXXX 0b10XXXXXX - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else if((word &0xF800 ) != 0xD800) { - // will generate three UTF-8 bytes - // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else { - // must be a surrogate pair - if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } - uint16_t diff = uint16_t(word - 0xD800); - if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } - uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - pos += 2; - } + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m256i _value) + : base8(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[32]) const { + return __lasx_xvst(this->value, reinterpret_cast<__m256i *>(dst), 0); } - return result(error_code::SUCCESS, utf8_output - start); -} -} // utf16_to_utf8 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } -#endif -/* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + __m256i origin = __lasx_xvand_v(this->value, __lasx_xvldi(0x1f)); + return __lasx_xvshuf_b(__lasx_xvldi(0), lookup_table, origin); + } -/* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ -#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H -#define SIMDUTF_VALID_UTF16_TO_UTF32_H + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } +}; -namespace simdutf { -namespace scalar { -namespace { -namespace utf16_to_utf32 { +// Signed bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) + : base8_numeric(_value) {} -template -inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char32_t* 
start{utf32_output}; - while (pos < len) { - uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if((word &0xF800 ) != 0xD800) { - // No surrogate pair, extend 16-bit word to 32-bit word - *utf32_output++ = char32_t(word); - pos++; - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - if(pos + 1 >= len) { return 0; } // minimal bound checking - uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - pos += 2; - } + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + simdutf_really_inline operator simd8<uint8_t>() const; + simdutf_really_inline bool is_ascii() const { + __m256i ascii_mask = __lasx_xvslti_b(this->value, 0); + if (__lasx_xbnz_v(ascii_mask)) + return false; + return true; } - return utf32_output - start; -} + // Order-sensitive comparisons + simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { + return __lasx_xvslt_b(other, this->value); + } + simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { + return __lasx_xvslt_b(this->value, other); + } +}; -} // utf16_to_utf32 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +// Unsigned bytes +template <> struct simd8<uint8_t> : base8_numeric<uint8_t> { + simdutf_really_inline simd8() : base8_numeric<uint8_t>() {} + simdutf_really_inline simd8(const __m256i _value) + : base8_numeric<uint8_t>(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, + uint8_t v21, uint8_t v22, uint8_t v23, uint8_t v24, uint8_t v25, + uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, + uint8_t v31) + : simd8((__m256i)v32u8{v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31}) {} -#endif -/* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ -/* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */ -#ifndef SIMDUTF_UTF16_TO_UTF32_H -#define SIMDUTF_UTF16_TO_UTF32_H + // Saturated math + simdutf_really_inline simd8<uint8_t> + saturating_sub(const simd8<uint8_t> other) const { + return __lasx_xvssub_bu(this->value, other); + } namespace simdutf { -namespace scalar { namespace { -namespace utf16_to_utf32 { + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8<uint8_t> + gt_bits(const simd8<uint8_t> other) const { + return this->saturating_sub(other); + } + simdutf_really_inline simd8<bool> + operator>=(const simd8<uint8_t> other) const { + return __lasx_xvsle_bu(other, *this); + } + simdutf_really_inline simd8<uint8_t> &operator-=(const simd8<uint8_t> other) { + value = __lasx_xvsub_b(value, other.value); + return *this; + } -template <endianness big_endian> -inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) { - const uint16_t *data = reinterpret_cast<const uint16_t *>(buf); -
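// Illustrative sketch, not from the vendored simdutf sources: the
// surrogate-pair arithmetic that the scalar UTF-16 <-> UTF-32 routines in
// this region rely on. A high/low surrogate pair carries 10 payload bits
// each, offset by 0x10000. Helper names are hypothetical.
#include <cstdint>
static uint32_t combine_surrogates(uint16_t high, uint16_t low) {
  // Expects high in 0xD800..0xDBFF and low in 0xDC00..0xDFFF.
  return (uint32_t(high - 0xD800) << 10) + (low - 0xDC00) + 0x10000;
}
static void split_into_surrogates(uint32_t cp, uint16_t &high, uint16_t &low) {
  cp -= 0x10000;                         // leaves a 20-bit value
  high = uint16_t(0xD800 + (cp >> 10));  // top 10 bits
  low = uint16_t(0xDC00 + (cp & 0x3FF)); // bottom 10 bits
}
// For example, combine_surrogates(0xD83D, 0xDE00) == 0x1F600.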
size_t pos = 0; - char32_t* start{utf32_output}; - while (pos < len) { - uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if((word &0xF800 ) != 0xD800) { - // No surrogate pair, extend 16-bit word to 32-bit word - *utf32_output++ = char32_t(word); - pos++; - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - if(diff > 0x3FF) { return 0; } - if(pos + 1 >= len) { return 0; } // minimal bound checking - uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if(diff2 > 0x3FF) { return 0; } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - pos += 2; - } + // Bit-specific operations + simdutf_really_inline bool is_ascii() const { + __m256i ascii_mask = __lasx_xvslti_b(this->value, 0); + if (__lasx_xbnz_v(ascii_mask)) + return false; + return true; + } + simdutf_really_inline bool any_bits_set_anywhere() const { + if (__lasx_xbnz_v(this->value)) + return true; + return false; + } + template <int N> simdutf_really_inline simd8<uint8_t> shr() const { + return __lasx_xvsrli_b(this->value, N); + } + template <int N> simdutf_really_inline simd8<uint8_t> shl() const { + return __lasx_xvslli_b(this->value, N); } - return utf32_output - start; -} -template <endianness big_endian> -inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) { - const uint16_t *data = reinterpret_cast<const uint16_t *>(buf); - size_t pos = 0; - char32_t* start{utf32_output}; - while (pos < len) { - uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if((word &0xF800 ) != 0xD800) { - // No surrogate pair, extend 16-bit word to 32-bit word - *utf32_output++ = char32_t(word); - pos++; - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } - if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } // minimal bound checking - uint16_t next_word = !match_system(big_endian) ?
utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - pos += 2; - } + simdutf_really_inline uint64_t sum_bytes() const { + const auto sum_u16 = __lasx_xvhaddw_hu_bu(value, value); + const auto sum_u32 = __lasx_xvhaddw_wu_hu(sum_u16, sum_u16); + const auto sum_u64 = __lasx_xvhaddw_du_wu(sum_u32, sum_u32); + + return uint64_t(__lasx_xvpickve2gr_du(sum_u64, 0)) + + uint64_t(__lasx_xvpickve2gr_du(sum_u64, 1)) + + uint64_t(__lasx_xvpickve2gr_du(sum_u64, 2)) + + uint64_t(__lasx_xvpickve2gr_du(sum_u64, 3)); } - return result(error_code::SUCCESS, utf32_output - start); +}; +simdutf_really_inline simd8::operator simd8() const { + return this->value; } -} // utf16_to_utf32 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 2, + "LASX kernel should use two registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; -#endif -/* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */ + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed -/* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ -#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H -#define SIMDUTF_VALID_UTF8_TO_UTF16_H + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1) + : chunks{chunk0, chunk1} {} + simdutf_really_inline simd8x64(const T *ptr) + : chunks{simd8::load(ptr), + simd8::load(ptr + sizeof(simd8) / sizeof(T))} {} -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_utf16 { + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd8) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd8) * 1 / sizeof(T)); + } -template -inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t* start{utf16_output}; - while (pos < len) { - // try to convert the next block of 8 ASCII bytes - if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 8; - while(pos < final_pos) { - *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); - pos++; - } - continue; - } - } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8, it should become - // a single UTF-16 word. - if(pos + 1 >= len) { break; } // minimal bound checking - uint16_t code_point = uint16_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111)); - if (!match_system(big_endian)) { - code_point = utf16::swap_bytes(uint16_t(code_point)); - } - *utf16_output++ = char16_t(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8, it should become - // a single UTF-16 word. 
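// Illustrative sketch, not from the vendored simdutf sources: the ASCII fast
// path used by the scalar converters in this region. Eight input bytes are
// loaded with memcpy (a safe unaligned load) and tested at once: if no byte
// has its top bit set, all eight are ASCII and can be copied verbatim. The
// helper name is hypothetical.
#include <cstdint>
#include <cstring>
static bool next_8_bytes_are_ascii(const uint8_t *data) {
  uint64_t v;
  std::memcpy(&v, data, sizeof(v));
  return (v & 0x8080808080808080ULL) == 0;
}
// Typical use, as in the loops above and below:
//   if (pos + 8 <= len && next_8_bytes_are_ascii(data + pos)) { /* bulk copy */ }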
- if(pos + 2 >= len) { break; } // minimal bound checking - uint16_t code_point = uint16_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111)); - if (!match_system(big_endian)) { - code_point = utf16::swap_bytes(uint16_t(code_point)); - } - *utf16_output++ = char16_t(code_point); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - if(pos + 3 >= len) { break; } // minimal bound checking - uint32_t code_point = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12) - | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111); - code_point -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - pos += 4; - } else { - // we may have a continuation but we do not do error checking - return 0; - } + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); } - return utf16_output - start; -} + simdutf_really_inline simd8x64 &operator|=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + return *this; + } -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + simdutf_really_inline simd8 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } -#endif -/* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ -/* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ -#ifndef SIMDUTF_UTF8_TO_UTF16_H -#define SIMDUTF_UTF8_TO_UTF16_H + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_utf16 { + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 0); + this->chunks[1].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 1); + } -template -inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t* start{utf16_output}; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 16; - while(pos < final_pos) { - *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); - pos++; - } - continue; - } - } + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8) * 0); + this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8) * 1); + } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *utf16_output++ = !match_system(big_endian) ? 
char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8, it should become - // a single UTF-16 word. - if(pos + 1 >= len) { return 0; } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } - // range check - uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if (code_point < 0x80 || 0x7ff < code_point) { return 0; } - if (!match_system(big_endian)) { - code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); - } - *utf16_output++ = char16_t(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8, it should become - // a single UTF-16 word. - if(pos + 2 >= len) { return 0; } // minimal bound checking - - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } - // range check - uint32_t code_point = (leading_byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if (code_point < 0x800 || 0xffff < code_point || - (0xd7ff < code_point && code_point < 0xe000)) { - return 0; - } - if (!match_system(big_endian)) { - code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); - } - *utf16_output++ = char16_t(code_point); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - if(pos + 3 >= len) { return 0; } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; } - - // range check - uint32_t code_point = - (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; } - code_point -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - pos += 4; - } else { - return 0; - } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] < mask, this->chunks[1] < mask) + .to_bitmask(); } - return utf16_output - start; -} -template -inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t* start{utf16_output}; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 16; - while(pos < final_pos) { - *utf16_output++ = !match_system(big_endian) ? 
char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); - pos++; - } - continue; - } - } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8, it should become - // a single UTF-16 word. - if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - // range check - uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); } - if (!match_system(big_endian)) { - code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); - } - *utf16_output++ = char16_t(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8, it should become - // a single UTF-16 word. - if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking - - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - // range check - uint32_t code_point = (leading_byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);} - if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); } - if (!match_system(big_endian)) { - code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); - } - *utf16_output++ = char16_t(code_point); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. 
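// Illustrative sketch, not from the vendored simdutf sources: the range
// checks behind the error codes used by convert_with_errors here. For a
// decoded four-byte sequence (3+6+6+6 payload bits), the remaining hazards
// are overlong encodings and values beyond U+10FFFF. Names are hypothetical.
#include <cstdint>
enum class utf8_error4 { ok, overlong, too_large };
static utf8_error4 classify_4byte(uint32_t code_point) {
  if (code_point <= 0xFFFF) return utf8_error4::overlong;   // fits in <= 3 bytes
  if (code_point > 0x10FFFF) return utf8_error4::too_large; // past last code point
  return utf8_error4::ok; // 0x10000..0x10FFFF, a surrogate pair in UTF-16
}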
- if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - - // range check - uint32_t code_point = - (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); } - if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); } - code_point -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - pos += 4; - } else { - // we either have too many continuation bytes or an invalid leading byte - if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); } - else { return result(error_code::HEADER_BITS, pos); } - } - } - return result(error_code::SUCCESS, utf16_output - start); -} - -/** - * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and we have - * up to len input bytes left, and we encountered some error. It is possible that - * the error is at 'buf' exactly, but it could also be in the previous bytes (up to 3 bytes back). - * - * prior_bytes indicates how many bytes, prior to 'buf' may belong to the current memory section - * and can be safely accessed. We use prior_bytes to access safely up to three bytes before 'buf'. - * - * The caller is responsible to ensure that len > 0. - * - * If the error is believed to have occurred prior to 'buf', the count value contained in the result - * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3. - */ -template <endianness big_endian> -inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output) { - size_t extra_len{0}; - // We potentially need to go back in time and find a leading byte. - // In theory '3' would be sufficient, but sometimes the error can go back quite far. - size_t how_far_back = prior_bytes; - // size_t how_far_back = 3; // 3 bytes in the past + current position - // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; } - bool found_leading_bytes{false}; - // important: it is i <= how_far_back and not 'i < how_far_back'. - for(size_t i = 0; i <= how_far_back; i++) { - unsigned char byte = buf[0-i]; - found_leading_bytes = ((byte & 0b11000000) != 0b10000000); - if(found_leading_bytes) { - buf -= i; - extra_len = i; - break; - } - } - // - // It is possible for this function to return a negative count in its result. - // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>. - // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator - // - // An unsigned type will simply wrap round arithmetically (well defined). - // - if(!found_leading_bytes) { - // If how_far_back == 3, we may have four consecutive continuation bytes!!! - // [....]
[continuation] [continuation] [continuation] | [buf is continuation] - // Or we possibly have a stream that does not start with a leading byte. - return result(error_code::TOO_LONG, 0-how_far_back); + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] > mask, this->chunks[1] > mask) + .to_bitmask(); } - result res = convert_with_errors(buf, len + extra_len, utf16_output); - if (res.error) { - res.count -= extra_len; + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64((simd8(__m256i(this->chunks[0])) >= mask), + (simd8(__m256i(this->chunks[1])) >= mask)) + .to_bitmask(); } - return res; -} +}; // struct simd8x64 -} // utf8_to_utf16 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +/* begin file src/simdutf/lasx/simd16-inl.h */ +template struct simd16; -#endif -/* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ +template > +struct base16 : base> { + using bitmask_type = uint32_t; -/* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */ -#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H -#define SIMDUTF_VALID_UTF8_TO_UTF32_H + simdutf_really_inline base16() : base>() {} + simdutf_really_inline base16(const __m256i _value) + : base>(_value) {} + template + simdutf_really_inline base16(const Pointer *ptr) + : base16(__lasx_xvld(reinterpret_cast(ptr), 0)) {} -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_utf32 { + /// the size of vector in bytes + static const int SIZE = sizeof(base>::value); -inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char32_t* start{utf32_output}; - while (pos < len) { - // try to convert the next block of 8 ASCII bytes - if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 8; - while(pos < final_pos) { - *utf32_output++ = char32_t(buf[pos]); - pos++; - } - continue; - } - } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *utf32_output++ = char32_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8 - if(pos + 1 >= len) { break; } // minimal bound checking - *utf32_output++ = char32_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111)); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8 - if(pos + 2 >= len) { break; } // minimal bound checking - *utf32_output++ = char32_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111)); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. 
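// Illustrative sketch, not from the vendored simdutf sources: the "rewind"
// step performed by rewind_and_convert_with_errors above. On an error inside
// a multi-byte sequence, the decoder walks back up to prior_bytes positions
// to find the leading byte (any byte whose top two bits are not 10) before
// re-decoding. The helper name and out-parameters are hypothetical.
#include <cstddef>
static bool rewind_to_leading_byte(const char *&buf, std::size_t prior_bytes,
                                   std::size_t &extra_len) {
  for (std::size_t i = 0; i <= prior_bytes; i++) {
    unsigned char byte =
        static_cast<unsigned char>(*(buf - static_cast<std::ptrdiff_t>(i)));
    if ((byte & 0b11000000) != 0b10000000) { // found a leading byte
      buf -= i;
      extra_len = i;
      return true;
    }
  }
  return false; // nothing but continuation bytes in range
}
// On success the caller re-runs conversion from the rewound buf and subtracts
// extra_len from the reported error position, as the code above does.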
- if(pos + 3 >= len) { break; } // minimal bound checking - uint32_t code_word = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12) - | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111); - *utf32_output++ = char32_t(code_word); - pos += 4; - } else { - // we may have a continuation but we do not do error checking - return 0; - } - } - return utf32_output - start; -} + /// the number of elements of type T a vector can hold + static const int ELEMENTS = SIZE / sizeof(T); +}; +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd16 : base16 { + static simdutf_really_inline simd16 splat(bool _value) { + return __lasx_xvreplgr2vr_h(uint16_t(-(!!_value))); + } -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const __m256i _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} -#endif -/* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */ -/* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ -#ifndef SIMDUTF_UTF8_TO_UTF32_H -#define SIMDUTF_UTF8_TO_UTF32_H + simdutf_really_inline bitmask_type to_bitmask() const { + __m256i mask = __lasx_xvmsknz_b(this->value); + bitmask_type mask0 = __lasx_xvpickve2gr_wu(mask, 0); + bitmask_type mask1 = __lasx_xvpickve2gr_wu(mask, 4); + return (mask0 | (mask1 << 16)); + } + simdutf_really_inline simd16 operator~() const { return *this ^ true; } -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_utf32 { + simdutf_really_inline bool is_zero() const { + return __lasx_xbz_v(this->value); + } -inline size_t convert(const char* buf, size_t len, char32_t* utf32_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char32_t* start{utf32_output}; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 16; - while(pos < final_pos) { - *utf32_output++ = char32_t(buf[pos]); - pos++; - } - continue; - } - } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! 
- *utf32_output++ = char32_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8 - if(pos + 1 >= len) { return 0; } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } - // range check - uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if (code_point < 0x80 || 0x7ff < code_point) { return 0; } - *utf32_output++ = char32_t(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8 - if(pos + 2 >= len) { return 0; } // minimal bound checking - - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } - // range check - uint32_t code_point = (leading_byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if (code_point < 0x800 || 0xffff < code_point || - (0xd7ff < code_point && code_point < 0xe000)) { - return 0; - } - *utf32_output++ = char32_t(code_point); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - if(pos + 3 >= len) { return 0; } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; } - - // range check - uint32_t code_point = - (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; } - *utf32_output++ = char32_t(code_point); - pos += 4; - } else { - return 0; - } + template simdutf_really_inline simd16 byte_right_shift() const { + const auto t0 = __lasx_xvbsrl_v(this->value, N); + const auto t1 = __lasx_xvpermi_q(this->value, __lasx_xvldi(0), 0b00000011); + const auto t2 = __lasx_xvbsll_v(t1, 16 - N); + const auto t3 = __lasx_xvor_v(t0, t2); + return t3; } - return utf32_output - start; -} -inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char32_t* start{utf32_output}; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 16; - while(pos < final_pos) { - *utf32_output++ = char32_t(buf[pos]); - pos++; - } - continue; - } - } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! 
- *utf32_output++ = char32_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8 - if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - // range check - uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); } - *utf32_output++ = char32_t(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8 - if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking - - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - // range check - uint32_t code_point = (leading_byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if (code_point < 0x800 || 0xffff < code_point) { return result(error_code::OVERLONG, pos); } - if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); } - *utf32_output++ = char32_t(code_point); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos);} - if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } - - // range check - uint32_t code_point = - (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); } - if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); } - *utf32_output++ = char32_t(code_point); - pos += 4; - } else { - // we either have too many continuation bytes or an invalid leading byte - if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); } - else { return result(error_code::HEADER_BITS, pos); } - } - } - return result(error_code::SUCCESS, utf32_output - start); -} + simdutf_really_inline uint16_t first() const { + return uint16_t(__lasx_xvpickve2gr_w(value, 0)); + } +}; -/** - * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and we have - * up to len input bytes left, and we encountered some error. It is possible that - * the error is at 'buf' exactly, but it could also be in the previous bytes location (up to 3 bytes back). - * - * prior_bytes indicates how many bytes, prior to 'buf' may belong to the current memory section - * and can be safely accessed. We prior_bytes to access safely up to three bytes before 'buf'. - * - * The caller is responsible to ensure that len > 0. - * - * If the error is believed to have occurred prior to 'buf', the count value contain in the result - * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3. 
- */ -inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output) { - size_t extra_len{0}; - // We potentially need to go back in time and find a leading byte. - size_t how_far_back = 3; // 3 bytes in the past + current position - if(how_far_back > prior_bytes) { how_far_back = prior_bytes; } - bool found_leading_bytes{false}; - // important: it is i <= how_far_back and not 'i < how_far_back'. - for(size_t i = 0; i <= how_far_back; i++) { - unsigned char byte = buf[0-i]; - found_leading_bytes = ((byte & 0b11000000) != 0b10000000); - if(found_leading_bytes) { - buf -= i; - extra_len = i; - break; - } +template struct base16_numeric : base16 { + static simdutf_really_inline simd16 splat(T _value) { + return __lasx_xvreplgr2vr_h((uint16_t)_value); } - // - // It is possible for this function to return a negative count in its result. - // C++ Standard Section 18.1 defines size_t is in which is described in C Standard as . - // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator - // - // An unsigned type will simply wrap round arithmetically (well defined). - // - if(!found_leading_bytes) { - // If how_far_back == 3, we may have four consecutive continuation bytes!!! - // [....] [continuation] [continuation] [continuation] | [buf is continuation] - // Or we possibly have a stream that does not start with a leading byte. - return result(error_code::TOO_LONG, 0-how_far_back); + static simdutf_really_inline simd16 zero() { return __lasx_xvldi(0); } + template + static simdutf_really_inline simd16 load(const Pointer values) { + return __lasx_xvld(values, 0); } - result res = convert_with_errors(buf, len + extra_len, utf32_output); - if (res.error) { - res.count -= extra_len; + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const __m256i _value) + : base16(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[8]) const { + return __lasx_xvst(this->value, reinterpret_cast<__m256i *>(dst), 0); } - return res; -} -} // utf8_to_utf32 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFFFu; } +}; -#endif -/* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ +// Unsigned code units +template <> struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m256i _value) + : base16_numeric(_value) {} -/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ -#ifndef SIMDUTF_LATIN1_TO_UTF8_H -#define SIMDUTF_LATIN1_TO_UTF8_H + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} -namespace simdutf { -namespace scalar { -namespace { -namespace latin1_to_utf8 { + // Array constructor + simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast(values))) {} -inline size_t convert(const char* buf, size_t len, char* utf8_output) { - const unsigned char *data = reinterpret_cast(buf); - size_t pos = 0; - char* start{utf8_output}; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - 
::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything - if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII - size_t final_pos = pos + 16; - while(pos < final_pos) { - *utf8_output++ = char(buf[pos]); - pos++; - } - continue; - } - } + // Order-specific operations + simdutf_really_inline simd16 &operator+=(const simd16 other) { + value = __lasx_xvadd_h(value, other.value); + return *this; + } - unsigned char byte = data[pos]; - if((byte & 0x80) == 0) { // if ASCII - // will generate one UTF-8 bytes - *utf8_output++ = char(byte); - pos++; - } else { - // will generate two UTF-8 bytes - *utf8_output++ = char((byte>>6) | 0b11000000); - *utf8_output++ = char((byte & 0b111111) | 0b10000000); - pos++; - } + // Change the endianness + simdutf_really_inline simd16 swap_bytes() const { + return __lasx_xvshuf4i_b(this->value, 0b10110001); } - return utf8_output - start; -} -} // latin1_to_utf8 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + template + static simdutf_really_inline simd8 + pack_shifted_right(const simd16 &v0, const simd16 &v1) { + return __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(v1.value, v0.value, N), + 0b11011000); + } -#endif -/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ -/* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */ -#ifndef SIMDUTF_LATIN1_TO_UTF16_H -#define SIMDUTF_LATIN1_TO_UTF16_H + // Pack with the unsigned saturation of two uint16_t code units into single + // uint8_t vector + static simdutf_really_inline simd8 pack(const simd16 &v0, + const simd16 &v1) { -namespace simdutf { -namespace scalar { -namespace { -namespace latin1_to_utf16 { + return pack_shifted_right<0>(v0, v1); + } -template -inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) { - const uint8_t* data = reinterpret_cast(buf); - size_t pos = 0; - char16_t* start{ utf16_output }; + simdutf_really_inline uint64_t sum() const { + const auto sum_u32 = __lasx_xvhaddw_wu_hu(value, value); + const auto sum_u64 = __lasx_xvhaddw_du_wu(sum_u32, sum_u32); - while (pos < len) { - uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point - *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word)); - pos++; + return uint64_t(__lasx_xvpickve2gr_du(sum_u64, 0)) + + uint64_t(__lasx_xvpickve2gr_du(sum_u64, 1)) + + uint64_t(__lasx_xvpickve2gr_du(sum_u64, 2)) + + uint64_t(__lasx_xvpickve2gr_du(sum_u64, 3)); } - return utf16_output - start; + template simdutf_really_inline simd16 byte_right_shift() const { + return __lasx_xvbsrl_v(this->value, N); + } +}; + +simdutf_really_inline simd16 operator<(const simd16 a, + const simd16 b) { + return __lasx_xvslt_hu(a.value, b.value); } -template -inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) { - const uint8_t* data = reinterpret_cast(buf); - size_t pos = 0; - char16_t* start{ utf16_output }; +simdutf_really_inline simd16 operator>(const simd16 a, + const simd16 b) { + return __lasx_xvslt_hu(b.value, a.value); +} - while (pos < len) { - uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point - *utf16_output++ = char16_t(match_system(big_endian) ? 
word : utf16::swap_bytes(word)); - pos++; - } +simdutf_really_inline simd16 operator<=(const simd16 a, + const simd16 b) { + return __lasx_xvsle_hu(a.value, b.value); +} - return result(error_code::SUCCESS, utf16_output - start); +simdutf_really_inline simd16 operator>=(const simd16 a, + const simd16 b) { + return __lasx_xvsle_hu(b.value, a.value); } -} // latin1_to_utf16 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +template struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 2, + "LASX kernel should use two registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; -#endif -/* end file src/scalar/latin1_to_utf16/latin1_to_utf16.h */ -/* begin file src/scalar/latin1_to_utf32/latin1_to_utf32.h */ -#ifndef SIMDUTF_LATIN1_TO_UTF32_H -#define SIMDUTF_LATIN1_TO_UTF32_H + simd16x32(const simd16x32 &o) = delete; // no copy allowed + simd16x32 & + operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed -namespace simdutf { -namespace scalar { -namespace { -namespace latin1_to_utf32 { + simdutf_really_inline simd16x32(const simd16 chunk0, + const simd16 chunk1) + : chunks{chunk0, chunk1} {} + simdutf_really_inline simd16x32(const T *ptr) + : chunks{simd16::load(ptr), + simd16::load(ptr + sizeof(simd16) / sizeof(T))} {} + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd16) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd16) * 1 / sizeof(T)); + } -inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) { - const unsigned char *data = reinterpret_cast(buf); - char32_t* start{utf32_output}; - for (size_t i = 0; i < len; i++) { - *utf32_output++ = (char32_t)data[i]; + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); } - return utf32_output - start; + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] <= mask, this->chunks[1] <= mask) + .to_bitmask(); + } +}; // struct simd16x32 + +simdutf_really_inline simd16 min(const simd16 a, + const simd16 b) { + return __lasx_xvmin_hu(a.value, b.value); } -} // latin1_to_utf32 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +simdutf_really_inline simd16 operator==(const simd16 a, + uint16_t b) { + const auto bv = __lasx_xvreplgr2vr_h(b); + return __lasx_xvseq_h(a.value, bv); +} -#endif -/* end file src/scalar/latin1_to_utf32/latin1_to_utf32.h */ +simdutf_really_inline simd16 as_vector_u16(const simd16 x) { + return x.value; +} -/* begin file src/scalar/utf8_to_latin1/utf8_to_latin1.h */ -#ifndef SIMDUTF_UTF8_TO_LATIN1_H -#define SIMDUTF_UTF8_TO_LATIN1_H +simdutf_really_inline simd16 operator&(const simd16 a, + uint16_t b) { + const auto bv = __lasx_xvreplgr2vr_h(b); + return __lasx_xvand_v(a.value, bv); +} -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_latin1 { +simdutf_really_inline simd16 operator&(const simd16 a, + const simd16 b) { + return __lasx_xvand_v(a.value, b.value); +} -inline size_t convert(const char* buf, size_t len, char* latin_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - 
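// ---------------------------------------------------------------------------
// [Editor's note — illustrative sketch, not part of the vendored patch.]
// Latin-1 covers only U+0000..U+00FF, so the UTF-8 to Latin-1 converter here
// accepts ASCII bytes plus two-byte sequences that decode to 0x80..0xFF, and
// rejects everything else. The two-byte decode step in isolation (the helper
// name is hypothetical):
static inline bool utf8_pair_to_latin1(unsigned char lead, unsigned char cont,
                                       char *out) {
  if ((lead & 0b11100000) != 0b11000000) return false; // not a 2-byte lead
  if ((cont & 0b11000000) != 0b10000000) return false; // bad continuation
  unsigned cp = (unsigned)(lead & 0b00011111) << 6 | (cont & 0b00111111);
  if (cp < 0x80 || 0xFF < cp) return false; // overlong, or beyond Latin-1
  *out = (char)cp;
  return true;
}
// ---------------------------------------------------------------------------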
char* start{latin_output}; - - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000 .... etc - if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII - size_t final_pos = pos + 16; - while(pos < final_pos) { - *latin_output++ = char(buf[pos]); - pos++; - } - continue; - } - } +simdutf_really_inline simd16 operator^(const simd16 a, + uint16_t b) { + const auto bv = __lasx_xvreplgr2vr_h(b); + return __lasx_xvxor_v(a.value, bv); +} - // suppose it is not an all ASCII byte sequence - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *latin_output++ = char(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate: - // We have a two-byte UTF-8 - if(pos + 1 >= len) { - return 0; - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10. - // range check - - uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation. - if (code_point < 0x80 || 0xFF < code_point) { - return 0; // We only care about the range 129-255 which is Non-ASCII latin1 characters. A code_point beneath 0x80 is invalid as it's already covered by bytes whose leading bit is zero. - } - *latin_output++ = char(code_point); - pos += 2; - } else { - return 0; - } - } - return latin_output - start; +simdutf_really_inline simd16 operator^(const simd16 a, + const simd16 b) { + return __lasx_xvxor_v(a.value, b.value); } +/* end file src/simdutf/lasx/simd16-inl.h */ +/* begin file src/simdutf/lasx/simd32-inl.h */ +template struct simd32; -inline result convert_with_errors(const char* buf, size_t len, char* latin_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char* start{latin_output}; - - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000...etc - if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII - size_t final_pos = pos + 16; - while(pos < final_pos) { - *latin_output++ = char(buf[pos]); - pos++; - } - continue; - } - } - // suppose it is not an all ASCII byte sequence - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! 
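// ---------------------------------------------------------------------------
// [Editor's note — illustrative sketch, not part of the vendored patch.]
// The *_with_errors variants in this hunk classify failures with simdutf's
// error codes: TOO_SHORT (truncated sequence or bad continuation byte),
// OVERLONG (value fits a shorter encoding), TOO_LARGE (above the target
// range), TOO_LONG (stray continuation byte), HEADER_BITS (invalid lead
// byte). The lead-byte dispatch that underlies those branches, in isolation
// (hypothetical helper; return values are this editor's convention):
static inline int utf8_sequence_length(unsigned char lead) {
  if (lead < 0b10000000) return 1;                  // ASCII
  if ((lead & 0b11000000) == 0b10000000) return 0;  // continuation: TOO_LONG
  if ((lead & 0b11100000) == 0b11000000) return 2;  // 2-byte lead
  if ((lead & 0b11110000) == 0b11100000) return 3;  // 3-byte lead
  if ((lead & 0b11111000) == 0b11110000) return 4;  // 4-byte lead
  return -1;                                        // HEADER_BITS (0xF8..0xFF)
}
// ---------------------------------------------------------------------------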
- *latin_output++ = char(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate: - // We have a two-byte UTF-8 - if(pos + 1 >= len) { - return result(error_code::TOO_SHORT, pos); } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10. - // range check - - uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation. - if (code_point < 0x80) { - return result(error_code::OVERLONG, pos); - } - if (0xFF < code_point) { - return result(error_code::TOO_LARGE, pos); - } // We only care about the range 129-255 which is Non-ASCII latin1 characters - *latin_output++ = char(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8 - return result(error_code::TOO_LARGE, pos); - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - return result(error_code::TOO_LARGE, pos); - } else { - // we either have too many continuation bytes or an invalid leading byte - if ((leading_byte & 0b11000000) == 0b10000000) { - return result(error_code::TOO_LONG, pos); - } +template <> struct simd32 { + __m256i value; + static const int SIZE = sizeof(value); + static const int ELEMENTS = SIZE / sizeof(uint32_t); - return result(error_code::HEADER_BITS, pos); + // constructors + simdutf_really_inline simd32(__m256i v) : value(v) {} - } + template + simdutf_really_inline simd32(Ptr *ptr) : value(__lasx_xvld(ptr, 0)) {} + + // in-place operators + simdutf_really_inline simd32 &operator-=(const simd32 other) { + value = __lasx_xvsub_w(value, other.value); + return *this; } - return result(error_code::SUCCESS, latin_output - start); -} + // members + simdutf_really_inline uint64_t sum() const { + const auto odd = __lasx_xvsrli_d(value, 32); + const auto even = __lasx_xvand_v(value, __lasx_xvreplgr2vr_d(0xffffffff)); -inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char* latin1_output) { - size_t extra_len{0}; - // We potentially need to go back in time and find a leading byte. - // In theory '3' would be sufficient, but sometimes the error can go back quite far. - size_t how_far_back = prior_bytes; - // size_t how_far_back = 3; // 3 bytes in the past + current position - // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; } - bool found_leading_bytes{false}; - // important: it is i <= how_far_back and not 'i < how_far_back'. - for(size_t i = 0; i <= how_far_back; i++) { - unsigned char byte = buf[0-i]; - found_leading_bytes = ((byte & 0b11000000) != 0b10000000); - if(found_leading_bytes) { - buf -= i; - extra_len = i; - break; - } - } - // - // It is possible for this function to return a negative count in its result. - // C++ Standard Section 18.1 defines size_t is in which is described in C Standard as . - // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator - // - // An unsigned type will simply wrap round arithmetically (well defined). 
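// ---------------------------------------------------------------------------
// [Editor's note — illustrative sketch, not part of the vendored patch.]
// rewind_and_convert_with_errors() walks back through at most `prior_bytes`
// positions looking for a byte that is not a UTF-8 continuation byte
// (10xxxxxx) before re-running the converter; note that `i <= how_far_back`
// deliberately includes the current byte. "Negative" error positions are
// then encoded as the well-defined unsigned wrap-around `0 - k` on size_t.
// A standalone restatement of the backward scan (helper name and the -1
// "not found" convention are this editor's invention):
static inline int bytes_back_to_lead_byte(const unsigned char *buf,
                                          size_t prior_bytes) {
  const int how_far_back = prior_bytes < 3 ? (int)prior_bytes : 3;
  for (int i = 0; i <= how_far_back; i++) {
    if ((buf[-i] & 0b11000000) != 0b10000000) {
      return i; // lead byte (or ASCII byte) found i positions back
    }
  }
  return -1; // only continuation bytes in range: the caller reports TOO_LONG
}
// ---------------------------------------------------------------------------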
- // - if(!found_leading_bytes) { - // If how_far_back == 3, we may have four consecutive continuation bytes!!! - // [....] [continuation] [continuation] [continuation] | [buf is continuation] - // Or we possibly have a stream that does not start with a leading byte. - return result(error_code::TOO_LONG, 0-how_far_back); - } - result res = convert_with_errors(buf, len + extra_len, latin1_output); - if (res.error) { - res.count -= extra_len; + const auto sum64 = __lasx_xvadd_d(odd, even); + + return uint64_t(__lasx_xvpickve2gr_du(sum64, 0)) + + uint64_t(__lasx_xvpickve2gr_du(sum64, 1)) + + uint64_t(__lasx_xvpickve2gr_du(sum64, 2)) + + uint64_t(__lasx_xvpickve2gr_du(sum64, 3)); } - return res; -} + // static members + static simdutf_really_inline simd32 splat(uint32_t x) { + return __lasx_xvreplgr2vr_w(x); + } -} // utf8_to_latin1 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + static simdutf_really_inline simd32 zero() { + return __lasx_xvrepli_w(0); + } +}; -#endif -/* end file src/scalar/utf8_to_latin1/utf8_to_latin1.h */ -/* begin file src/scalar/utf16_to_latin1/utf16_to_latin1.h */ -#ifndef SIMDUTF_UTF16_TO_LATIN1_H -#define SIMDUTF_UTF16_TO_LATIN1_H +// ------------------------------------------------------------ -namespace simdutf { -namespace scalar { -namespace { -namespace utf16_to_latin1 { +template <> struct simd32 { + __m256i value; + static const int SIZE = sizeof(value); -#include // for std::memcpy + // constructors + simdutf_really_inline simd32(__m256i v) : value(v) {} +}; -template -inline size_t convert(const char16_t* buf, size_t len, char* latin_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - std::vector temp_output(len); - char* current_write = temp_output.data(); - uint16_t word = 0; - uint16_t too_large = 0; - - while (pos < len) { - word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - too_large |= word; - *current_write++ = char(word & 0xFF); - pos++; - } - if((too_large & 0xFF00) != 0) { return 0; } +// ------------------------------------------------------------ - // Only copy to latin_output if there were no errors - std::memcpy(latin_output, temp_output.data(), len); +simdutf_really_inline simd32 operator&(const simd32 a, + const simd32 b) { + return __lasx_xvor_v(a.value, b.value); +} - return current_write - temp_output.data(); +simdutf_really_inline simd32 operator<(const simd32 a, + const simd32 b) { + return __lasx_xvslt_wu(a.value, b.value); } -template -inline result convert_with_errors(const char16_t* buf, size_t len, char* latin_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char* start{latin_output}; - uint16_t word; - - while (pos < len) { - if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that they are Latin1 - uint64_t v1, v2, v3, v4; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - ::memcpy(&v2, data + pos + 4, sizeof(uint64_t)); - ::memcpy(&v3, data + pos + 8, sizeof(uint64_t)); - ::memcpy(&v4, data + pos + 12, sizeof(uint64_t)); - - if (!match_system(big_endian)) { v1 = (v1 >> 8) | (v1 << (64 - 8)); } - if (!match_system(big_endian)) { v2 = (v2 >> 8) | (v2 << (64 - 8)); } - if (!match_system(big_endian)) { v3 = (v3 >> 8) | (v3 << (64 - 8)); } - if (!match_system(big_endian)) { v4 = (v1 >> 8) | (v4 << (64 - 8)); } - - if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) { - size_t final_pos = pos + 16; - while(pos < final_pos) { - *latin_output++ = !match_system(big_endian) ? 
char(utf16::swap_bytes(data[pos])) : char(data[pos]); - pos++; - } - continue; - } - } - word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if((word & 0xFF00 ) == 0) { - *latin_output++ = char(word & 0xFF); - pos++; - } else { return result(error_code::TOO_LARGE, pos); } - } - return result(error_code::SUCCESS,latin_output - start); +simdutf_really_inline simd32 operator>(const simd32 a, + const simd32 b) { + return __lasx_xvslt_wu(b.value, a.value); } +// ------------------------------------------------------------ -} // utf16_to_latin1 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +simdutf_really_inline simd32 as_vector_u32(const simd32 v) { + return v.value; +} +/* end file src/simdutf/lasx/simd32-inl.h */ +/* begin file src/simdutf/lasx/simd64-inl.h */ +template struct simd64; -#endif -/* end file src/scalar/utf16_to_latin1/utf16_to_latin1.h */ -/* begin file src/scalar/utf32_to_latin1/utf32_to_latin1.h */ -#ifndef SIMDUTF_UTF32_TO_LATIN1_H -#define SIMDUTF_UTF32_TO_LATIN1_H +template <> struct simd64 { + __m256i value; + static const int SIZE = sizeof(value); + static const int ELEMENTS = SIZE / sizeof(uint64_t); -namespace simdutf { -namespace scalar { -namespace { -namespace utf32_to_latin1 { + // constructors + simdutf_really_inline simd64(__m256i v) : value(v) {} -inline size_t convert(const char32_t *buf, size_t len, char *latin1_output) { - const uint32_t *data = reinterpret_cast(buf); - char* start = latin1_output; - uint32_t utf32_char; - size_t pos = 0; - uint32_t too_large = 0; + template + simdutf_really_inline simd64(Ptr *ptr) : value(__lasx_xvld(ptr, 0)) {} - while (pos < len) { - utf32_char = (uint32_t)data[pos]; - too_large |= utf32_char; - *latin1_output++ = (char)(utf32_char & 0xFF); - pos++; + // in-place operators + simdutf_really_inline simd64 &operator+=(const simd64 other) { + value = __lasx_xvadd_d(value, other.value); + return *this; } - if((too_large & 0xFFFFFF00) != 0) { return 0; } - return latin1_output - start; -} -inline result convert_with_errors(const char32_t *buf, size_t len, char *latin1_output) { - const uint32_t *data = reinterpret_cast(buf); - char* start{latin1_output}; - size_t pos = 0; - while (pos < len) { - if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1 - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0xFFFFFF00FFFFFF00) == 0) { - *latin1_output++ = char(buf[pos]); - *latin1_output++ = char(buf[pos+1]); - pos += 2; - continue; - } - } - uint32_t utf32_char = data[pos]; - if ((utf32_char & 0xFFFFFF00) == 0) { // Check if the character can be represented in Latin-1 - *latin1_output++ = (char)(utf32_char & 0xFF); - pos++; - } else { return result(error_code::TOO_LARGE, pos); }; + // members + simdutf_really_inline uint64_t sum() const { + return uint64_t(__lasx_xvpickve2gr_du(value, 0)) + + uint64_t(__lasx_xvpickve2gr_du(value, 1)) + + uint64_t(__lasx_xvpickve2gr_du(value, 2)) + + uint64_t(__lasx_xvpickve2gr_du(value, 3)); } - return result(error_code::SUCCESS, latin1_output - start); -} -} // utf32_to_latin1 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + // static members + static simdutf_really_inline simd64 zero() { + return __lasx_xvrepli_d(0); + } +}; -#endif -/* end file src/scalar/utf32_to_latin1/utf32_to_latin1.h */ +// ------------------------------------------------------------ -/* begin file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */ -#ifndef 
SIMDUTF_VALID_UTF8_TO_LATIN1_H -#define SIMDUTF_VALID_UTF8_TO_LATIN1_H +template <> struct simd64 { + __m256i value; + static const int SIZE = sizeof(value); -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_latin1 { + // constructors + simdutf_really_inline simd64(__m256i v) : value(v) {} +}; -inline size_t convert_valid(const char* buf, size_t len, char* latin_output) { - const uint8_t *data = reinterpret_cast(buf); +// ------------------------------------------------------------ - size_t pos = 0; - char* start{latin_output}; - - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything - if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII - size_t final_pos = pos + 16; - while(pos < final_pos) { - *latin_output++ = char(buf[pos]); - pos++; - } - continue; - } - } +simd64 sum_8bytes(const simd8 v) { + const auto sum_u16 = __lasx_xvhaddw_hu_bu(v, v); + const auto sum_u32 = __lasx_xvhaddw_wu_hu(sum_u16, sum_u16); + const auto sum_u64 = __lasx_xvhaddw_du_wu(sum_u32, sum_u32); - // suppose it is not an all ASCII byte sequence - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *latin_output++ = char(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate: - // We have a two-byte UTF-8 - if(pos + 1 >= len) { break; } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10. - // range check - - uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation. - *latin_output++ = char(code_point); - pos += 2; - } else { - // we may have a continuation but we do not do error checking - return 0; - } - } - return latin_output - start; + return simd64(sum_u64); } +/* end file src/simdutf/lasx/simd64-inl.h */ -} // utf8_to_latin1 namespace +} // namespace simd } // unnamed namespace -} // namespace scalar +} // namespace lasx } // namespace simdutf +#endif // SIMDUTF_LASX_SIMD_H +/* end file src/simdutf/lasx/simd.h */ + +/* begin file src/simdutf/lasx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP + +#if SIMDUTF_CAN_ALWAYS_RUN_LASX +// nothing needed. 
+#else
+SIMDUTF_UNTARGET_REGION
+#endif
+/* end file src/simdutf/lasx/end.h */
+
+#endif // SIMDUTF_IMPLEMENTATION_LASX
+
+#endif // SIMDUTF_LASX_H
+/* end file src/simdutf/lasx.h */
+/* begin file src/simdutf/lsx.h */
+#ifndef SIMDUTF_LSX_H
+#define SIMDUTF_LSX_H
+
+#ifdef SIMDUTF_FALLBACK_H
+  #error "lsx.h must be included before fallback.h"
+#endif
+
+#ifndef SIMDUTF_CAN_ALWAYS_RUN_LASX
+  #error "lsx.h must be included after lasx.h"
+#endif
+
+#ifndef SIMDUTF_IMPLEMENTATION_LSX
+  #if SIMDUTF_CAN_ALWAYS_RUN_LASX
+    #define SIMDUTF_IMPLEMENTATION_LSX 0
+  #else
+    #define SIMDUTF_IMPLEMENTATION_LSX (SIMDUTF_IS_LSX)
+  #endif
+#endif
+#if SIMDUTF_IMPLEMENTATION_LSX && SIMDUTF_IS_LSX
+  #define SIMDUTF_CAN_ALWAYS_RUN_LSX 1
+#else
+  #define SIMDUTF_CAN_ALWAYS_RUN_LSX 0
+#endif
+
+#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)
+
+#if SIMDUTF_IMPLEMENTATION_LSX
+
+namespace simdutf {
+/**
+ * Implementation for LoongArch SX.
+ */ +namespace lsx {} // namespace lsx +} // namespace simdutf -inline size_t convert_valid(const char32_t *buf, size_t len, char *latin1_output) { - const uint32_t *data = reinterpret_cast(buf); - char* start = latin1_output; - uint32_t utf32_char; - size_t pos = 0; +/* begin file src/simdutf/lsx/implementation.h */ +#ifndef SIMDUTF_LSX_IMPLEMENTATION_H +#define SIMDUTF_LSX_IMPLEMENTATION_H - while (pos < len) { - utf32_char = (uint32_t)data[pos]; - if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1 - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0xFFFFFF00FFFFFF00) == 0) { - *latin1_output++ = char(buf[pos]); - *latin1_output++ = char(buf[pos+1]); - pos += 2; - continue; - } - } - *latin1_output++ = (char)(utf32_char & 0xFF); - pos++; +namespace simdutf { +namespace lsx { - } - return latin1_output - start; +namespace { +using namespace simdutf; } +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() + : simdutf::implementation("lsx", "LOONGARCH SX", + internal::instruction_set::LSX) {} +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; +#endif // SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool validate_ascii(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept final; + + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; + void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept final; + void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 +#if 
SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const 
noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result 
convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *buf, size_t length, + char16_t *output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t *buf, + size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t *buf, + size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + utf16_length_from_utf8(const char *input, size_t length) const noexcept; + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept; + ; + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept; + ; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf32_length_from_utf8(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + latin1_length_from_utf8(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + utf8_length_from_latin1(const char *input, size_t length) const noexcept; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, 
base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused result + base64_to_binary(const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept; + size_t binary_to_base64_with_lines(const char *input, size_t length, + char *output, size_t line_length, + base64_options options) const noexcept; + const char *find(const char *start, const char *end, + char character) const noexcept; + const char16_t *find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept; +#endif // SIMDUTF_FEATURE_BASE64 +}; -} // utf32_to_latin1 namespace -} // unnamed namespace -} // namespace scalar +} // namespace lsx } // namespace simdutf -#endif -/* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */ +#endif // SIMDUTF_LSX_IMPLEMENTATION_H +/* end file src/simdutf/lsx/implementation.h */ +/* begin file src/simdutf/lsx/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "lsx" +// #define SIMDUTF_IMPLEMENTATION lsx +#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 +/* end file src/simdutf/lsx/begin.h */ + // Declarations +/* begin file src/simdutf/lsx/intrinsics.h */ +#ifndef SIMDUTF_LSX_INTRINSICS_H +#define SIMDUTF_LSX_INTRINSICS_H -SIMDUTF_PUSH_DISABLE_WARNINGS -SIMDUTF_DISABLE_UNDESIRED_WARNINGS +// This should be the correct header whether +// you use visual studio or other compilers. +#include -#if SIMDUTF_IMPLEMENTATION_ARM64 -/* begin file src/arm64/implementation.cpp */ -/* begin file src/simdutf/arm64/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "arm64" -// #define SIMDUTF_IMPLEMENTATION arm64 -/* end file src/simdutf/arm64/begin.h */ -namespace simdutf { -namespace arm64 { -namespace { -#ifndef SIMDUTF_ARM64_H -#error "arm64.h must be included" -#endif -using namespace simd; +/* +Encoding of argument for LoongArch64 xvldi instruction. See: +https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/#__m256i-__lasx_xvldi-imm_n1024_1023-imm -simdutf_really_inline bool is_ascii(const simd8x64& input) { - simd8 bits = input.reduce_or(); - return bits.max_val() < 0b10000000u; -} +1: imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes -simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { - simd8 is_second_byte = prev1 >= uint8_t(0b11000000u); - simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); - simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); - // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well. - // This will work fine because we only have to report errors for cases with 0-1 lead bytes. - // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is - // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character. - // The error will be detected there. 
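// ---------------------------------------------------------------------------
// [Editor's note — illustrative sketch, not part of the vendored patch.]
// must_be_continuation() combines three "byte i is the k-th byte of a
// multi-byte sequence" predicates with XOR rather than OR. For well-formed
// positions at most one predicate is true, so XOR equals OR; when several
// are true the sequences overlap and another lane reports the error anyway,
// as the comment above explains. A scalar restatement of the predicate:
static inline bool scalar_must_be_continuation(unsigned char prev1,
                                               unsigned char prev2,
                                               unsigned char prev3) {
  bool is_second_byte = prev1 >= 0b11000000u; // one byte back: 2/3/4-byte lead
  bool is_third_byte  = prev2 >= 0b11100000u; // two back: 3/4-byte lead
  bool is_fourth_byte = prev3 >= 0b11110000u; // three back: 4-byte lead
  return is_second_byte ^ is_third_byte ^ is_fourth_byte;
}
// ---------------------------------------------------------------------------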
- return is_second_byte ^ is_third_byte ^ is_fourth_byte; -} +2: imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes -simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { - simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); - simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); - return is_third_byte ^ is_fourth_byte; -} +3: imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes -// common functions for utf8 conversions -simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) { - // Low half contains 10cccccc|1110aaaa - // High half contains 10bbbbbb|10bbbbbb -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t sh = simdutf_make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10); -#else - const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10}; -#endif - uint8x16_t perm = vqtbl1q_u8(in, sh); - // Split into half vectors. - // 10cccccc|1110aaaa - uint8x8_t perm_low = vget_low_u8(perm); // no-op - // 10bbbbbb|10bbbbbb - uint8x8_t perm_high = vget_high_u8(perm); - // xxxxxxxx 10bbbbbb - uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op - // xxxxxxxx 1110aaaa - uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op - // Assemble with shift left insert. - // xxxxxxaa aabbbbbb - uint16x4_t mid_high = vsli_n_u16(mid, high, 6); - // (perm_low << 8) | (perm_low >> 8) - // xxxxxxxx 10cccccc - uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low)); - // Shift left insert into the low bits - // aaaabbbb bbcccccc - uint16x4_t composed = vsli_n_u16(low, mid_high, 6); - return composed; -} +4: imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes -simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) { - // Converts 6 2 byte UTF-8 characters to 6 UTF-16 characters. - // Technically this calculates 8, but 6 does better and happens more often - // (The languages which use these codepoints use ASCII spaces so 8 would need to be - // in the middle of a very long word). +5: imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes - // 10bbbbbb 110aaaaa - uint16x8_t upper = vreinterpretq_u16_u8(in); - // (in << 8) | (in >> 8) - // 110aaaaa 10bbbbbb - uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in)); - // 00000000 000aaaaa - uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F)); - // Assemble with shift left insert. - // 00000aaa aabbbbbb - uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6); - return composed; -} +6: imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes -simdutf_really_inline uint16x8_t convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) { - // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. - // This is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six code - // code units spanning between 1 and 2 bytes each is 12 bytes. 
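// ---------------------------------------------------------------------------
// [Editor's note — illustrative sketch, not part of the vendored patch.]
// convert_utf8_2_byte_to_utf16() above assembles 00000aaa'aabbbbbb from the
// byte pair 110aaaaa 10bbbbbb using a NEON shift-left-insert. The same
// assembly for a single code unit, in scalar form:
#include <cstdint>

static inline uint16_t utf8_2byte_to_utf16(unsigned char lead,
                                           unsigned char cont) {
  // 110aaaaa 10bbbbbb  ->  00000aaa aabbbbbb
  return (uint16_t)(((lead & 0x1F) << 6) | (cont & 0x3F));
}
// ---------------------------------------------------------------------------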
- uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx])); - // Shuffle - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 110aaaaa 10bbbbbb - uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh)); - // Mask - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 00000000 00bbbbbb - uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits - // 1 byte: 00000000 00000000 - // 2 byte: 000aaaaa 00000000 - uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits - // Combine with a shift right accumulate - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 00000aaa aabbbbbb - uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); - return composed; -} +7: imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all +lanes -/* begin file src/arm64/arm_detect_encodings.cpp */ -template -// len is known to be a multiple of 2 when this is called -int arm_detect_encodings(const char * buf, size_t len) { - const char* start = buf; - const char* end = buf + len; +8: imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to +all lanes - bool is_utf8 = true; - bool is_utf16 = true; - bool is_utf32 = true; +9: imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes - int out = 0; +10: imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast +the result as 64-bit elements to all lanes +*/ - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - - uint32x4_t currentmax = vmovq_n_u32(0x0); - - checker check{}; - - while(buf + 64 <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - uint16x8_t secondin = vld1q_u16(reinterpret_cast(buf) + simd16::SIZE / sizeof(char16_t)); - uint16x8_t thirdin = vld1q_u16(reinterpret_cast(buf) + 2*simd16::SIZE / sizeof(char16_t)); - uint16x8_t fourthin = vld1q_u16(reinterpret_cast(buf) + 3*simd16::SIZE / sizeof(char16_t)); - - const auto u0 = simd16(in); - const auto u1 = simd16(secondin); - const auto u2 = simd16(thirdin); - const auto u3 = simd16(fourthin); - - const auto v0 = u0.shr<8>(); - const auto v1 = u1.shr<8>(); - const auto v2 = u2.shr<8>(); - const auto v3 = u3.shr<8>(); - - const auto in16 = simd16::pack(v0, v1); - const auto nextin16 = simd16::pack(v2, v3); - - const uint64_t surrogates_wordmask0 = ((in16 & v_f8) == v_d8).to_bitmask64(); - const uint64_t surrogates_wordmask1 = ((nextin16 & v_f8) == v_d8).to_bitmask64(); - - // Check for surrogates - if (surrogates_wordmask0 != 0 || surrogates_wordmask1 != 0) { - // Cannot be UTF8 - is_utf8 = false; - // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates - // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. - // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant - // bytes of a 32-bit word since they always come in pairs in UTF-16LE. - // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units. 
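// ---------------------------------------------------------------------------
// [Editor's note — illustrative sketch, not part of the vendored patch.]
// The encoding-detection logic below distinguishes UTF-16LE from UTF-32LE by
// where surrogate-looking values sit: a valid UTF-32 code unit is at most
// 0x10FFFF, so its two most significant bytes can never hold the surrogate
// pattern, while UTF-16 surrogate pairs regularly do. Scalar restatement of
// the per-word test (assumes a little-endian 32-bit load; name hypothetical):
#include <cstdint>

static inline bool high_half_is_surrogate(uint32_t word_le) {
  uint16_t hi = (uint16_t)(word_le >> 16); // two most significant bytes
  return (hi & 0xF800) == 0xD800;          // 0xD800..0xDFFF pattern
}
// ---------------------------------------------------------------------------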
- - if (((surrogates_wordmask0 | surrogates_wordmask1) & 0xf0f0f0f0f0f0f0f0) != 0) { - is_utf32 = false; - // Code from arm_validate_utf16le.cpp - // Not efficient, we do not process surrogates_wordmask1 - const char16_t * input = reinterpret_cast(buf); - const char16_t* end16 = reinterpret_cast(start) + len/2; - - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - - const uint64_t V0 = ~surrogates_wordmask0; - - const auto vH0 = ((in16 & v_fc) == v_dc); - const uint64_t H0 = vH0.to_bitmask64(); - - const uint64_t L0 = ~H0 & surrogates_wordmask0; - - const uint64_t a0 = L0 & (H0 >> 4); - - const uint64_t b0 = a0 << 4; - - const uint64_t c0 = V0 | a0 | b0; - if (c0 == ~0ull) { - input += 16; - } else if (c0 == 0xfffffffffffffffull) { - input += 15; - } else { - is_utf16 = false; - break; - } - - while (input + 16 < end16) { - const auto in0 = simd16(input); - const auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - const simd8 in_16 = simd16::pack(t0, t1); - - const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64(); - if(surrogates_wordmask == 0) { - input += 16; - } else { - const uint64_t V = ~surrogates_wordmask; - - const auto vH = ((in_16 & v_fc) == v_dc); - const uint64_t H = vH.to_bitmask64(); - - const uint64_t L = ~H & surrogates_wordmask; - - const uint64_t a = L & (H >> 4); - - const uint64_t b = a << 4; - - const uint64_t c = V | a | b; - if (c == ~0ull) { - input += 16; - } else if (c == 0xfffffffffffffffull) { - input += 15; - } else { - is_utf16 = false; - break; - } - } - } - } else { - is_utf16 = false; - // Check for UTF-32 - if (len % 4 == 0) { - const char32_t * input = reinterpret_cast(buf); - const char32_t* end32 = reinterpret_cast(start) + len/4; - - // Must start checking for surrogates - uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); - const uint32x4_t offset = vmovq_n_u32(0xffff2000); - const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); - - const uint32x4_t in32 = vreinterpretq_u32_u16(in); - const uint32x4_t secondin32 = vreinterpretq_u32_u16(secondin); - const uint32x4_t thirdin32 = vreinterpretq_u32_u16(thirdin); - const uint32x4_t fourthin32 = vreinterpretq_u32_u16(fourthin); - - currentmax = vmaxq_u32(in32,currentmax); - currentmax = vmaxq_u32(secondin32,currentmax); - currentmax = vmaxq_u32(thirdin32,currentmax); - currentmax = vmaxq_u32(fourthin32,currentmax); - - currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(thirdin32, offset), currentoffsetmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(fourthin32, offset), currentoffsetmax); - - while (input + 4 < end32) { - const uint32x4_t in_32 = vld1q_u32(reinterpret_cast(input)); - currentmax = vmaxq_u32(in_32,currentmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax); - input += 4; - } - - uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(vmaxvq_u32(forbidden_words) != 0) { - is_utf32 = false; - } - } else { - is_utf32 = false; - } - } - break; - } - // If no surrogate, validate under other encodings as well +namespace vldi { - // UTF-32 validation - currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax); - currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax); - currentmax = 
vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax); - currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax); +template class const_u16 { + constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff); + constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff); - // UTF-8 validation - // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h - simd::simd8x64 in8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(secondin), vreinterpretq_u8_u16(thirdin), vreinterpretq_u8_u16(fourthin)); - check.check_next_input(in8); + constexpr static bool is_case5 = uint16_t(b0) == v; + constexpr static bool is_case6 = (uint16_t(b1) << 8) == v; + constexpr static bool is_case9 = (b0 == b1); + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)); - buf += 64; - } +public: + constexpr static uint16_t operation = is_case5 ? 0b10100 + : is_case6 ? 0b10101 + : is_case9 ? 0b11000 + : is_case10 ? 0x11001 + : 0xffff; + + constexpr static uint16_t byte = + is_case5 ? b0 + : is_case6 ? b1 + : is_case9 ? b0 + : is_case10 ? ((b0 ? 0x55 : 0x00) | (b1 ? 0xaa : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; - // Check which encodings are possible +template class const_u32 { + constexpr static const uint8_t b0 = (v & 0xff); + constexpr static const uint8_t b1 = ((v >> 8) & 0xff); + constexpr static const uint8_t b2 = ((v >> 16) & 0xff); + constexpr static const uint8_t b3 = ((v >> 24) & 0xff); + + constexpr static bool is_case1 = (uint32_t(b0) == v); + constexpr static bool is_case2 = ((uint32_t(b1) << 8) == v); + constexpr static bool is_case3 = ((uint32_t(b2) << 16) == v); + constexpr static bool is_case4 = ((uint32_t(b3) << 24) == v); + constexpr static bool is_case5 = (b0 == b2) && (b1 == 0) && (b3 == 0); + constexpr static bool is_case6 = (b1 == b3) && (b0 == 0) && (b2 == 0); + constexpr static bool is_case7 = (b3 == 0) && (b2 == 0) && (b0 == 0xff); + constexpr static bool is_case8 = (b3 == 0) && (b1 == 0xff) && (b0 == 0xff); + constexpr static bool is_case9 = (b0 == b1) && (b0 == b2) && (b0 == b3); + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) && + ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)); - if (is_utf8) { - if (static_cast(buf - start) != len) { - uint8_t block[64]{}; - std::memset(block, 0x20, 64); - std::memcpy(block, buf, len - (buf - start)); - simd::simd8x64 in(block); - check.check_next_input(in); - } - if (!check.errors()) { - out |= simdutf::encoding_type::UTF8; - } - } +public: + constexpr static uint16_t operation = is_case1 ? 0b10000 + : is_case2 ? 0b10001 + : is_case3 ? 0b10010 + : is_case4 ? 0b10011 + : is_case5 ? 0b10100 + : is_case6 ? 0b10101 + : is_case7 ? 0b10110 + : is_case8 ? 0b10111 + : is_case9 ? 0b11000 + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_case1 ? b0 + : is_case2 ? b1 + : is_case3 ? b2 + : is_case4 ? b3 + : is_case5 ? b0 + : is_case6 ? b1 + : is_case7 ? b1 + : is_case8 ? b2 + : is_case9 ? b0 + : is_case10 ? ((b0 ? 0x11 : 0x00) | (b1 ? 0x22 : 0x00) | + (b2 ? 0x44 : 0x00) | (b3 ? 
0x88 : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; - if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { - out |= simdutf::encoding_type::UTF16_LE; - } +template class const_u64 { + constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff); + constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff); + constexpr static const uint8_t b2 = ((v >> 2 * 8) & 0xff); + constexpr static const uint8_t b3 = ((v >> 3 * 8) & 0xff); + constexpr static const uint8_t b4 = ((v >> 4 * 8) & 0xff); + constexpr static const uint8_t b5 = ((v >> 5 * 8) & 0xff); + constexpr static const uint8_t b6 = ((v >> 6 * 8) & 0xff); + constexpr static const uint8_t b7 = ((v >> 7 * 8) & 0xff); + + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) && + ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)) && + ((b4 == 0xff) || (b4 == 0x00)) && ((b5 == 0xff) || (b5 == 0x00)) && + ((b6 == 0xff) || (b6 == 0x00)) && ((b7 == 0xff) || (b7 == 0x00)); - if (is_utf32 && (len % 4 == 0)) { - const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); - uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); - if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { - out |= simdutf::encoding_type::UTF32_LE; - } - } +public: + constexpr static bool is_32bit = + ((v & 0xffffffff) == (v >> 32)) && const_u32<(v >> 32)>::value; + constexpr static uint8_t op_32bit = const_u32<(v >> 32)>::operation; + constexpr static uint8_t byte_32bit = const_u32<(v >> 32)>::byte; + + constexpr static uint16_t operation = is_32bit ? op_32bit + : is_case10 ? 0x11001 + : 0xffff; + + constexpr static uint16_t byte = + is_32bit ? byte_32bit + : is_case10 + ? ((b0 ? 0x01 : 0x00) | (b1 ? 0x02 : 0x00) | (b2 ? 0x04 : 0x00) | + (b3 ? 0x08 : 0x00) | (b4 ? 0x10 : 0x00) | (b5 ? 0x20 : 0x00) | + (b6 ? 0x40 : 0x00) | (b7 ? 0x80 : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; +} // namespace vldi - return out; -} -/* end file src/arm64/arm_detect_encodings.cpp */ +// Uncomment when running under QEMU affected +// by bug https://gitlab.com/qemu-project/qemu/-/issues/2865 +// Versions <= 9.2.2 are affected, likely anything newer is correct. +#ifndef QEMU_VLDI_BUG +// #define QEMU_VLDI_BUG 1 +#endif -/* begin file src/arm64/arm_validate_utf16.cpp */ -template -const char16_t* arm_validate_utf16(const char16_t* input, size_t size) { - const char16_t* end = input + size; - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - while (input + 16 < end) { - // 0. Load data: since the validation takes into account only higher - // byte of each word, we compress the two vectors into one which - // consists only the higher bytes. - auto in0 = simd16(input); - auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - if (!match_system(big_endian)) { - in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0))); - in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1))); - } - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - const simd8 in = simd16::pack(t0, t1); - // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). 
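// Illustrative sketch (not from the vendored sources): a worked instance of
// the vldi constant encoding implemented by the const_u16/const_u32 helpers
// above. When all bytes of a constant are equal ("case 9"), the immediate is
// (0b11000 << 8 | byte) - 8192, so a vector of 0x80 bytes encodes as -1920.
// The helper below is hypothetical and self-contained.
#include <cstdint>
constexpr int vldi_replicated_byte_immediate(uint8_t b) {
  return int((0b11000u << 8) | b) - 8192; // operation 0b11000: repeat the byte
}
static_assert(vldi_replicated_byte_immediate(0x80) == -1920,
              "0x80808080 can be materialized with __lsx_vldi(-1920)");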
- const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); - if(surrogates_wordmask == 0) { - input += 16; - } else { - // 2. We have some surrogates that have to be distinguished: - // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) - // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) - // - // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - - // V - non-surrogate code units - // V = not surrogates_wordmask - const uint64_t V = ~surrogates_wordmask; - - // H - word-mask for high surrogates: the six highest bits are 0b1101'11 - const auto vH = ((in & v_fc) == v_dc); - const uint64_t H = vH.to_bitmask64(); - - // L - word mask for low surrogates - // L = not H and surrogates_wordmask - const uint64_t L = ~H & surrogates_wordmask; - - const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint64_t b = a << 4; // Just mark that the opposite fact is hold, - // thanks to that we have only two masks for valid case. - const uint64_t c = V | a | b; // Combine all the masks into the final one. - if (c == ~0ull) { - // The whole input register contains valid UTF-16, i.e., - // either single code units or proper surrogate pairs. - input += 16; - } else if (c == 0xfffffffffffffffull) { - // The 15 lower code units of the input register contains valid UTF-16. - // The 15th word may be either a low or high surrogate. It the next - // iteration we 1) check if the low surrogate is followed by a high - // one, 2) reject sole high surrogate. - input += 15; - } else { - return nullptr; - } - } - } - return input; +#ifndef lsx_splat_u16 + #ifdef QEMU_VLDI_BUG + #define lsx_splat_u16(v) __lsx_vreplgr2vr_h(v) + #define lsx_splat_u32(v) __lsx_vreplgr2vr_w(v) + #else +namespace { +template constexpr __m128i lsx_splat_u16_aux() { + return ((int16_t(x) < 512) && (int16_t(x) > -512)) + ? __lsx_vrepli_h( + ((int16_t(x) < 512) && (int16_t(x) > -512)) ? int16_t(x) : 0) + : (vldi::const_u16::valid + ? __lsx_vldi(vldi::const_u16::valid + ? vldi::const_u16::value + : 0) + : __lsx_vreplgr2vr_h(x)); +} + +template constexpr __m128i lsx_splat_u32_aux() { + return ((int32_t(x) < 512) && (int32_t(x) > -512)) + ? __lsx_vrepli_w( + ((int32_t(x) < 512) && (int32_t(x) > -512)) ? int32_t(x) : 0) + : (vldi::const_u32::valid + ? __lsx_vldi(vldi::const_u32::valid + ? vldi::const_u32::value + : 0) + : __lsx_vreplgr2vr_w(x)); } +} // namespace + #define lsx_splat_u16(v) lsx_splat_u16_aux<(v)>() + #define lsx_splat_u32(v) lsx_splat_u32_aux<(v)>() + #endif // QEMU_VLDI_BUG +#endif // lsx_splat_u16 +#endif // SIMDUTF_LSX_INTRINSICS_H +/* end file src/simdutf/lsx/intrinsics.h */ +/* begin file src/simdutf/lsx/bitmanipulation.h */ +#ifndef SIMDUTF_LSX_BITMANIPULATION_H +#define SIMDUTF_LSX_BITMANIPULATION_H + +#include +namespace simdutf { +namespace lsx { +namespace { -template -const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) { - const char16_t* start = input; - const char16_t* end = input + size; - - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - while (input + 16 < end) { - // 0. Load data: since the validation takes into account only higher - // byte of each word, we compress the two vectors into one which - // consists only the higher bytes. 
- auto in0 = simd16<uint16_t>(input);
- auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
-
- if (!match_system(big_endian)) {
- in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
- in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
- }
- const auto t0 = in0.shr<8>();
- const auto t1 = in1.shr<8>();
- const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
- // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
- const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
- if(surrogates_wordmask == 0) {
- input += 16;
- } else {
- // 2. We have some surrogates that have to be distinguished:
- // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
- // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
- //
- // Fact: high surrogate has 11th bit set (3rd bit in the higher word)
-
- // V - non-surrogate code units
- // V = not surrogates_wordmask
- const uint64_t V = ~surrogates_wordmask;
-
- // H - word-mask for high surrogates: the six highest bits are 0b1101'11
- const auto vH = ((in & v_fc) == v_dc);
- const uint64_t H = vH.to_bitmask64();
-
- // L - word mask for low surrogates
- // L = not H and surrogates_wordmask
- const uint64_t L = ~H & surrogates_wordmask;
-
- const uint64_t a = L & (H >> 4); // A low surrogate must be followed by a high one.
- // (A low surrogate placed in the 7th register's word
- // is an exception we handle.)
- const uint64_t b = a << 4; // Just mark that the opposite fact holds,
- // thanks to that we have only two masks for the valid case.
- const uint64_t c = V | a | b; // Combine all the masks into the final one.
- if (c == ~0ull) {
- // The whole input register contains valid UTF-16, i.e.,
- // either single code units or proper surrogate pairs.
- input += 16;
- } else if (c == 0xfffffffffffffffull) {
- // The 15 lower code units of the input register contain valid UTF-16.
- // The 15th word may be either a low or high surrogate. In the next
- // iteration we 1) check if the low surrogate is followed by a high
- // one, 2) reject a sole high surrogate.
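// Illustrative sketch (not from the vendored sources): the mask algebra in
// the comments above, restated with one bit per 16-bit code unit. The NEON
// version packs comparison results to four bits per unit, which is why it
// shifts by 4 and accepts 0x0fffffffffffffff (15 of 16 units) as "valid so
// far"; here, for simplicity, a trailing unmatched low surrogate (the
// 15-unit case above) is simply reported as invalid.
#include <cstdint>
inline bool valid_utf16_block(const uint16_t in[16]) {
  uint32_t surrogates = 0, highs = 0;
  for (int i = 0; i < 16; i++) {
    if ((in[i] & 0xf800) == 0xd800) surrogates |= 1u << i; // 0xD800..0xDFFF
    if ((in[i] & 0xfc00) == 0xdc00) highs |= 1u << i;      // 0xDC00..0xDFFF
  }
  const uint32_t V = ~surrogates & 0xffff; // non-surrogate units
  const uint32_t L = surrogates & ~highs;  // low (leading) surrogates
  const uint32_t a = L & (highs >> 1);     // low followed by high
  const uint32_t b = a << 1;               // the matching high surrogate
  return ((V | a | b) & 0xffff) == 0xffff; // every unit accounted for
}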
- input += 15; - } else { - return result(error_code::SURROGATE, input - start); - } - } - } - return result(error_code::SUCCESS, input - start); +simdutf_really_inline int count_ones(uint64_t input_num) { + return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0); } -/* end file src/arm64/arm_validate_utf16.cpp */ -/* begin file src/arm64/arm_validate_utf32le.cpp */ -const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) { - const char32_t* end = input + size; +#if SIMDUTF_NEED_TRAILING_ZEROES +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { + return __builtin_ctzll(input_num); +} +#endif - const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); - const uint32x4_t offset = vmovq_n_u32(0xffff2000); - const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); - uint32x4_t currentmax = vmovq_n_u32(0x0); - uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); +} // unnamed namespace +} // namespace lsx +} // namespace simdutf - while (input + 4 < end) { - const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); - currentmax = vmaxq_u32(in,currentmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); - input += 4; - } +#endif // SIMDUTF_LSX_BITMANIPULATION_H +/* end file src/simdutf/lsx/bitmanipulation.h */ +/* begin file src/simdutf/lsx/simd.h */ +#ifndef SIMDUTF_LSX_SIMD_H +#define SIMDUTF_LSX_SIMD_H - uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); - if(vmaxvq_u32(is_zero) != 0) { - return nullptr; - } - is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(vmaxvq_u32(is_zero) != 0) { - return nullptr; - } +namespace simdutf { +namespace lsx { +namespace { +namespace simd { - return input; -} +template struct simd8; +// +// Base class of simd8 and simd8, both of which use __m128i +// internally. 
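// Illustrative sketch (not from the vendored sources): scalar restatement of
// the max/offset-max trick used by arm_validate_utf32le above. Adding
// 0xffff2000 modulo 2^32 maps exactly the surrogate range 0xD800..0xDFFF
// into (0xFFFFF7FF, 0xFFFFFFFF]; values of 0xE000 and above wrap around to
// small numbers, so one running maximum detects surrogates while a second
// running maximum against 0x10FFFF catches out-of-range code points.
#include <cstdint>
#include <cstddef>
inline bool valid_utf32(const uint32_t *in, size_t n) {
  uint32_t max_value = 0, max_offset = 0;
  for (size_t i = 0; i < n; i++) {
    if (in[i] > max_value) max_value = in[i];
    const uint32_t off = in[i] + 0xffff2000u; // wraps modulo 2^32
    if (off > max_offset) max_offset = off;
  }
  return max_value <= 0x10ffffu && max_offset <= 0xfffff7ffu;
}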
+// +template > struct base_u8 { + __m128i value; + static const int SIZE = sizeof(value); -const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) { - const char32_t* start = input; - const char32_t* end = input + size; + // Conversion from/to SIMD register + simdutf_really_inline base_u8(const __m128i _value) : value(_value) {} + simdutf_really_inline operator const __m128i &() const { return this->value; } + simdutf_really_inline operator __m128i &() { return this->value; } - const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); - const uint32x4_t offset = vmovq_n_u32(0xffff2000); - const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); - uint32x4_t currentmax = vmovq_n_u32(0x0); - uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + // Bit operations + simdutf_really_inline simd8 operator|(const simd8 other) const { + return __lsx_vor_v(this->value, other); + } + simdutf_really_inline simd8 operator&(const simd8 other) const { + return __lsx_vand_v(this->value, other); + } + simdutf_really_inline simd8 operator^(const simd8 other) const { + return __lsx_vxor_v(this->value, other); + } + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd8 &operator|=(const simd8 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast | other; + return *this_cast; + } - while (input + 4 < end) { - const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); - currentmax = vmaxq_u32(in,currentmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); + friend simdutf_really_inline Mask operator==(const simd8 lhs, + const simd8 rhs) { + return __lsx_vseq_b(lhs, rhs); + } - uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); - if(vmaxvq_u32(is_zero) != 0) { - return result(error_code::TOO_LARGE, input - start); - } + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return __lsx_vor_v(__lsx_vbsll_v(this->value, N), + __lsx_vbsrl_v(prev_chunk.value, 16 - N)); + } +}; - is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(vmaxvq_u32(is_zero) != 0) { - return result(error_code::SURROGATE, input - start); - } +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base_u8 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; - input += 4; - } + static simdutf_really_inline simd8 splat(bool _value) { + return __lsx_vreplgr2vr_b(uint8_t(-(!!_value))); + } - return result(error_code::SUCCESS, input - start); -} -/* end file src/arm64/arm_validate_utf32le.cpp */ + simdutf_really_inline simd8(const __m128i _value) : base_u8(_value) {} + // False constructor + simdutf_really_inline simd8() : simd8(__lsx_vldi(0)) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {} + simdutf_really_inline void store(uint8_t dst[16]) const { + return __lsx_vst(this->value, dst, 0); + } -/* begin file src/arm64/arm_convert_latin1_to_utf8.cpp */ -/* - Returns a pair: the first unprocessed byte from buf and utf8_output - A scalar routing should carry on the conversion of the tail. -*/ -std::pair -arm_convert_latin1_to_utf8(const char *latin1_input, size_t len, - char *utf8_out) { - uint8_t *utf8_output = reinterpret_cast(utf8_out); - const char *end = latin1_input + len; - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); - // We always write 16 bytes, of which more than the first 8 bytes - // are valid. 
A safety margin of 8 is more than sufficient. - while (latin1_input + 16 + 8 <= end) { - uint8x16_t in8 = vld1q_u8(reinterpret_cast(latin1_input)); - if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!! - vst1q_u8(utf8_output, in8); - utf8_output += 16; - latin1_input += 16; - continue; - } + simdutf_really_inline uint32_t to_bitmask() const { + return __lsx_vpickve2gr_wu(__lsx_vmsknz_b(*this), 0); + } +}; - // We just fallback on UTF-16 code. This could be optimized/simplified - // further. - uint16x8_t in16 = vmovl_u8(vget_low_u8(in8)); - // 1. prepare 2-byte values - // input 8-bit word : [aabb|bbbb] x 8 - // expected output : [1100|00aa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - - // t0 = [0000|00aa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(in16, 2); - // t1 = [0000|00aa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(in16, v_003f); - // t3 = [0000|00aa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [1100|00aa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f); - const uint8x16_t utf8_unpacked = - vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4)); - // 3. prepare bitmask for 8-bit lookup -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = simdutf_make_uint16x8_t(0x0001, 0x0004, 0x0010, 0x0040, - 0x0002, 0x0008, 0x0020, 0x0080); -#else - const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, - 0x0002, 0x0008, 0x0020, 0x0080}; -#endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); +// Unsigned bytes +template <> struct simd8 : base_u8 { + static simdutf_really_inline simd8 splat(uint8_t _value) { + return __lsx_vreplgr2vr_b(_value); + } + static simdutf_really_inline simd8 zero() { return __lsx_vldi(0); } + static simdutf_really_inline simd8 load(const uint8_t *values) { + return __lsx_vld(values, 0); + } + simdutf_really_inline simd8(const __m128i _value) + : base_u8(_value) {} + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8((__m128i)v16u8{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15}) {} - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); - // 6. 
adjust pointers - latin1_input += 8; - utf8_output += row[0]; + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, + uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, + uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } - } // while + // Store to array + simdutf_really_inline void store(uint8_t dst[16]) const { + return __lsx_vst(this->value, dst, 0); + } - return std::make_pair(latin1_input, reinterpret_cast(utf8_output)); -} -/* end file src/arm64/arm_convert_latin1_to_utf8.cpp */ -/* begin file src/arm64/arm_convert_latin1_to_utf16.cpp */ -template -std::pair arm_convert_latin1_to_utf16(const char* buf, size_t len, char16_t* utf16_output) { - const char* end = buf + len; - - while (buf + 16 <= end) { - uint8x16_t in8 = vld1q_u8(reinterpret_cast(buf)); - uint16x8_t inlow = vmovl_u8(vget_low_u8(in8)); - if (!match_system(big_endian)) { inlow = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inlow))); } - vst1q_u16(reinterpret_cast(utf16_output), inlow); - uint16x8_t inhigh = vmovl_u8(vget_high_u8(in8)); - if (!match_system(big_endian)) { inhigh = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inhigh))); } - vst1q_u16(reinterpret_cast(utf16_output+8), inhigh); - utf16_output += 16; - buf += 16; - } + // Order-specific operations + simdutf_really_inline simd8 + operator>=(const simd8 other) const { + return __lsx_vsle_bu(other, *this); + } + simdutf_really_inline simd8 + operator>(const simd8 other) const { + return __lsx_vslt_bu(other, *this); + } + simdutf_really_inline simd8 &operator-=(const simd8 other) { + value = __lsx_vsub_b(value, other.value); + return *this; + } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true + // = nonzero. For ARM, returns all 1's. 
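// Illustrative sketch (not from the vendored sources): the byte-at-a-time
// equivalent of arm_convert_latin1_to_utf8 above. A Latin-1 byte [aabbbbbb]
// above 0x7F expands to the two-byte form 0xC0|aa, 0x80|bbbbbb, which is
// what the 0xC080 template OR'd into the widened lanes achieves.
#include <cstdint>
#include <cstddef>
inline size_t latin1_to_utf8(const uint8_t *src, size_t len, uint8_t *dst) {
  size_t out = 0;
  for (size_t i = 0; i < len; i++) {
    const uint8_t b = src[i];
    if (b < 0x80) {
      dst[out++] = b;                          // ASCII passes through
    } else {
      dst[out++] = uint8_t(0xc0 | (b >> 6));   // 110000aa
      dst[out++] = uint8_t(0x80 | (b & 0x3f)); // 10bbbbbb
    }
  }
  return out; // number of UTF-8 bytes written
}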
+ simdutf_really_inline simd8 + gt_bits(const simd8 other) const { + return simd8(*this > other); + } - return std::make_pair(buf, utf16_output); -} -/* end file src/arm64/arm_convert_latin1_to_utf16.cpp */ -/* begin file src/arm64/arm_convert_latin1_to_utf32.cpp */ -std::pair arm_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) { - const char* end = buf + len; - - while (buf + 16 <= end) { - uint8x16_t in8 = vld1q_u8(reinterpret_cast(buf)); - uint16x8_t in8low = vmovl_u8(vget_low_u8(in8)); - uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low)); - uint32x4_t in16lowhigh = vmovl_u16(vget_high_u16(in8low)); - uint16x8_t in8high = vmovl_u8(vget_high_u8(in8)); - uint32x4_t in8highlow = vmovl_u16(vget_low_u16(in8high)); - uint32x4_t in8highhigh = vmovl_u16(vget_high_u16(in8high)); - vst1q_u32(reinterpret_cast(utf32_output), in16lowlow); - vst1q_u32(reinterpret_cast(utf32_output+4), in16lowhigh); - vst1q_u32(reinterpret_cast(utf32_output+8), in8highlow); - vst1q_u32(reinterpret_cast(utf32_output+12), in8highhigh); - - utf32_output += 16; - buf += 16; - } + // Bit-specific operations + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { + return __lsx_vslt_bu(__lsx_vldi(0), __lsx_vand_v(this->value, bits)); + } + simdutf_really_inline bool is_ascii() const { + return __lsx_vpickve2gr_hu(__lsx_vmskgez_b(this->value), 0) == 0xFFFF; + } - return std::make_pair(buf, utf32_output); -} -/* end file src/arm64/arm_convert_latin1_to_utf32.cpp */ + simdutf_really_inline bool any_bits_set_anywhere() const { + return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(this->value), 0) > 0; + } + template simdutf_really_inline simd8 shr() const { + return __lsx_vsrli_b(this->value, N); + } + template simdutf_really_inline simd8 shl() const { + return __lsx_vslli_b(this->value, N); + } -/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */ -// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 16, usually 12). -template -size_t convert_masked_utf8_to_utf16(const char *input, - uint64_t utf8_end_of_code_point_mask, - char16_t *&utf16_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - uint8x16_t in = vld1q_u8(reinterpret_cast(input)); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - // - // Optimization note: our main path below is load-latency dependent. Thus it is maybe - // beneficial to have fast paths that depend on branch prediction but have less latency. - // This results in more instructions but, potentially, also higher speeds. + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } - // We first try a few fast paths. - // The obvious first test is ASCII, which actually consumes the full 16. - if((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) { - // We process in chunks of 16 bytes - // The routine in simd.h is reused. - simd8 temp{vreinterpretq_s8_u8(in)}; - temp.store_ascii_as_utf16(utf16_output); - utf16_output += 16; // We wrote 16 16-bit characters. - return 16; // We consumed 16 bytes. 
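// Illustrative sketch (not from the vendored sources): one way to derive the
// end-of-code-point mask consumed above, and why the constants select the
// fast paths: all-ASCII input ends a code point at every byte (0xFFFF), pure
// 3-byte input ends at bytes 2,5,8,11 (0x924), and pure 2-byte input ends at
// bytes 1,3,5,7,9,11 (0xAAA), as tested just below.
#include <cstdint>
inline uint32_t end_of_code_point_mask(const uint8_t in[17]) {
  // Bit i is set when byte i is the last byte of its code point, i.e. the
  // following byte is not a UTF-8 continuation byte (0b10xxxxxx). One byte
  // past the block is examined, which the callers cover with a safety margin.
  uint32_t mask = 0;
  for (int i = 0; i < 16; i++) {
    if ((in[i + 1] & 0xc0) != 0x80) mask |= 1u << i;
  }
  return mask;
}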
+ template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); } - // 3 byte sequences are the next most common, as seen in CJK, which has long sequences - // of these. - if (input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte UTF-16 code units. - uint16x4_t composed = convert_utf8_3_byte_to_utf16(in); - // Byte swap if necessary - if (!match_system(big_endian)) { - composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed))); - } - vst1_u16(reinterpret_cast(utf16_output), composed); - utf16_output += 4; // We wrote 4 16-bit characters. - return 12; // We consumed 12 bytes. + template + simdutf_really_inline simd8 + apply_lookup_16_to(const simd8 original) const { + __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f)); + return __lsx_vshuf_b(__lsx_vldi(0), *this, simd8(original_tmp)); } - // 2 byte sequences occur in short bursts in languages like Greek and Russian. - if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) { - // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte UTF-16 code units. - uint16x8_t composed = convert_utf8_2_byte_to_utf16(in); - // Byte swap if necessary - if (!match_system(big_endian)) { - composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); - } - vst1q_u16(reinterpret_cast(utf16_output), composed); + simdutf_really_inline uint64_t sum_bytes() const { + const auto sum_u16 = __lsx_vhaddw_hu_bu(value, value); + const auto sum_u32 = __lsx_vhaddw_wu_hu(sum_u16, sum_u16); + const auto sum_u64 = __lsx_vhaddw_du_wu(sum_u32, sum_u32); - utf16_output += 6; // We wrote 6 16-bit characters. - return 12; // We consumed 12 bytes. + return uint64_t(__lsx_vpickve2gr_du(sum_u64, 0)) + + uint64_t(__lsx_vpickve2gr_du(sum_u64, 1)); } +}; - /// We do not have a fast path available, or the fast path is unimportant, so we fallback. - const uint8_t idx = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; +// Signed bytes +template <> struct simd8 { + __m128i value; - const uint8_t consumed = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + static const int SIZE = sizeof(value); - if (idx < 64) { - // SIX (6) input code-code units - // Convert to UTF-16 - uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx); - // Byte swap if necessary - if (!match_system(big_endian)) { - composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); - } - // Store - vst1q_u16(reinterpret_cast(utf16_output), composed); - utf16_output += 6; // We wrote 6 16-bit characters. - return consumed; - } else if (idx < 145) { - // FOUR (4) input code-code units - // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. - uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - // XXX: depending on the system scalar instructions might be faster. 
- // 1 byte: 00000000 00000000 0ccccccc - // 2 byte: 00000000 110bbbbb 10cccccc - // 3 byte: 1110aaaa 10bbbbbb 10cccccc - uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); - // 1 byte: 00000000 0ccccccc - // 2 byte: xx0bbbbb x0cccccc - // 3 byte: xxbbbbbb x0cccccc - uint16x4_t lowperm = vmovn_u32(perm); - // Partially mask with bic (doesn't require a temporary register unlike and) - // The shift left insert below will clear the top bits. - // 1 byte: 00000000 00000000 - // 2 byte: xx0bbbbb 00000000 - // 3 byte: xxbbbbbb 00000000 - uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00))); - // ASCII - // 1 byte: 00000000 0ccccccc - // 2+byte: 00000000 00cccccc - uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F)); - // Split into narrow vectors. - // 2 byte: 00000000 00000000 - // 3 byte: 00000000 xxxxaaaa - uint16x4_t highperm = vshrn_n_u32(perm, 16); - // Shift right accumulate the middle byte - // 1 byte: 00000000 0ccccccc - // 2 byte: 00xx0bbb bbcccccc - // 3 byte: 00xxbbbb bbcccccc - uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2); - // Shift left and insert the top 4 bits, overwriting the garbage - // 1 byte: 00000000 0ccccccc - // 2 byte: 00000bbb bbcccccc - // 3 byte: aaaabbbb bbcccccc - uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12); - // Byte swap if necessary - if (!match_system(big_endian)) { - composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed))); - } - vst1_u16(reinterpret_cast(utf16_output), composed); + static simdutf_really_inline simd8 splat(int8_t _value) { + return __lsx_vreplgr2vr_b(_value); + } + static simdutf_really_inline simd8 zero() { return __lsx_vldi(0); } + static simdutf_really_inline simd8 load(const int8_t values[16]) { + return __lsx_vld(values, 0); + } - utf16_output += 4; // We wrote 4 16-bit codepoints - return consumed; - } else if (idx < 209) { - // THREE (3) input code-code units - if (input_utf8_end_of_code_point_mask == 0x888) { - // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte UTF-16 pairs. - // Generating surrogate pairs is a little tricky though, but it is easier when we - // can assume they are all pairs. - // This version does not use the LUT, but 4 byte sequences are less common and the - // overhead of the extra memory access is less important than the early branch overhead - // in shorter sequences. 
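// Illustrative sketch (not from the vendored sources): the scalar value that
// the shift-right-accumulate / shift-left-insert assembly above produces for
// one pre-validated 1-3 byte sequence.
#include <cstdint>
inline uint16_t decode_utf8_upto_3(const uint8_t *b, int len /* 1..3 */) {
  if (len == 1) return b[0];                               // 0ccccccc
  if (len == 2)                                            // 110bbbbb 10cccccc
    return uint16_t(((b[0] & 0x1f) << 6) | (b[1] & 0x3f));
  return uint16_t(((b[0] & 0x0f) << 12) |                  // 1110aaaa
                  ((b[1] & 0x3f) << 6) | (b[2] & 0x3f));   // 10bbbbbb 10cccccc
}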
+ template + simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const { + __m128i zero = __lsx_vldi(0); + if simdutf_constexpr (match_system(big_endian)) { + __lsx_vst(__lsx_vilvl_b(zero, (__m128i)this->value), + reinterpret_cast(p), 0); + __lsx_vst(__lsx_vilvh_b(zero, (__m128i)this->value), + reinterpret_cast(p + 8), 0); + } else { + __lsx_vst(__lsx_vilvl_b((__m128i)this->value, zero), + reinterpret_cast(p), 0); + __lsx_vst(__lsx_vilvh_b((__m128i)this->value, zero), + reinterpret_cast(p + 8), 0); + } + } + + simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const { + __m128i zero = __lsx_vldi(0); + __m128i in16low = __lsx_vilvl_b(zero, (__m128i)this->value); + __m128i in16high = __lsx_vilvh_b(zero, (__m128i)this->value); + __m128i in32_0 = __lsx_vilvl_h(zero, in16low); + __m128i in32_1 = __lsx_vilvh_h(zero, in16low); + __m128i in32_2 = __lsx_vilvl_h(zero, in16high); + __m128i in32_3 = __lsx_vilvh_h(zero, in16high); + __lsx_vst(in32_0, reinterpret_cast(p), 0); + __lsx_vst(in32_1, reinterpret_cast(p + 4), 0); + __lsx_vst(in32_2, reinterpret_cast(p + 8), 0); + __lsx_vst(in32_3, reinterpret_cast(p + 12), 0); + } + + // In places where the table can be reused, which is most uses in simdutf, it + // is worth it to do 4 table lookups, as there is no direct zero extension + // from u8 to u32. + simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t *p) const { + const simd8 tb1{0, 255, 255, 255, 1, 255, 255, 255, + 2, 255, 255, 255, 3, 255, 255, 255}; + const simd8 tb2{4, 255, 255, 255, 5, 255, 255, 255, + 6, 255, 255, 255, 7, 255, 255, 255}; + const simd8 tb3{8, 255, 255, 255, 9, 255, 255, 255, + 10, 255, 255, 255, 11, 255, 255, 255}; + const simd8 tb4{12, 255, 255, 255, 13, 255, 255, 255, + 14, 255, 255, 255, 15, 255, 255, 255}; + + // encourage store pairing and interleaving + const auto shuf1 = this->apply_lookup_16_to(tb1); + const auto shuf2 = this->apply_lookup_16_to(tb2); + shuf1.store(reinterpret_cast(p)); + shuf2.store(reinterpret_cast(p + 4)); + + const auto shuf3 = this->apply_lookup_16_to(tb3); + const auto shuf4 = this->apply_lookup_16_to(tb4); + shuf3.store(reinterpret_cast(p + 8)); + shuf4.store(reinterpret_cast(p + 12)); + } + // Conversion from/to SIMD register + simdutf_really_inline simd8(const __m128i _value) : value(_value) {} - // Swap byte pairs - // 10dddddd 10cccccc|10bbbbbb 11110aaa - // 10cccccc 10dddddd|11110aaa 10bbbbbb - uint8x16_t swap = vrev16q_u8(in); - // Shift left 2 bits - // cccccc00 dddddd00 xxxxxxxx bbbbbb00 - uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2)); - // Create a magic number containing the low 2 bits of the trail surrogate and all the - // corrections needed to create the pair. 
- // UTF-8 4b prefix = -0x0000|0xF000
- // surrogate offset = -0x0000|0x0040 (0x10000 << 6)
- // surrogate high = +0x0000|0xD800
- // surrogate low = +0xDC00|0x0000
- // -------------------------------
- // = +0xDC00|0xE7C0
- uint32x4_t magic = vmovq_n_u32(0xDC00E7C0);
- // Generate unadjusted trail surrogate minus lowest 2 bits
- // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
- uint32x4_t trail = vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift);
- // Insert low 2 bits of trail surrogate to magic number for later
- // 11011100 00000000 11100111 110000cc
- uint16x8_t magic_with_low_2 = vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30));
- // Generate lead surrogate
- // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
- uint32x4_t lead = vreinterpretq_u32_u16(vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6));
- // Mask out lead
- // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
- lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF)));
- // Blend pairs
- // 000000cc ccdddddd|11110aaa bbbbbb00
- uint16x8_t blend = vreinterpretq_u16_u32(vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead));
- // Add magic number to finish the result
- // 110111CC CCDDDDDD|110110AA BBBBBBCC
- uint16x8_t composed = vaddq_u16(blend, magic_with_low_2);
- // Byte swap if necessary
- if (!match_system(big_endian)) {
- composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
- }
- vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
- utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
- return 12; // We consumed 12 bytes.
- }
- // 3 1-4 byte sequences
- uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+ // Zero constructor
+ simdutf_really_inline simd8() : simd8(zero()) {}
+ // Splat constructor
+ simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
- // 1 byte: 00000000 00000000 00000000 0ddddddd
- // 2 byte: 00000000 00000000 110ccccc 10dddddd
- // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
- // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
- uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
- // Mask the low and middle bytes
- // 00000000 00000000 00000000 0ddddddd
- uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f));
- // Because the surrogates need more work, the high surrogate is computed first.
- uint32x4_t middlehigh = vshlq_n_u32(perm, 2);
- // 00000000 00000000 00cccccc 00000000
- uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00));
- // Start assembling the sequence. Since the 4th byte is in the same position as it
- // would be in a surrogate and there is no dependency, shift left instead of right.
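// Illustrative sketch (not from the vendored sources): the scalar surrogate
// construction that the single magic-number addition above folds into one
// step (stripping the UTF-8 prefix bits, applying the -0x10000 offset in
// pre-shifted form, and adding both 0xD800/0xDC00 prefixes at once).
#include <cstdint>
inline void encode_surrogate_pair(uint32_t cp /* 0x10000..0x10FFFF */,
                                  uint16_t out[2]) {
  const uint32_t v = cp - 0x10000;         // 20 significant bits
  out[0] = uint16_t(0xd800 | (v >> 10));   // leading surrogate, high 10 bits
  out[1] = uint16_t(0xdc00 | (v & 0x3ff)); // trailing surrogate, low 10 bits
}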
- // 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx - // 4 byte: 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx - uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh); - // Top 16 bits contains the high ten bits of the surrogate pair before correction - // 3 byte: 00000000 10bbbbcc|cccc0000 00000000 - // 4 byte: 11110aaa bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction - uint32x4_t abc = vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4)); - // Combine the low 6 or 7 bits by a shift right accumulate - // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct - // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o correction - uint32x4_t composed = vsraq_n_u32(ascii, abc, 6); - // After this is for surrogates - // Blend the low and high surrogates - // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd - uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed); - // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits yet as - // 0x10000 was not subtracted from the codepoint yet. - // 4 byte: 11110aaa bbbbbbcc|000000cc ccdddddd - uint16x8_t masked_pair = - vreinterpretq_u16_u32(vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF)))); - // Correct the remaining UTF-8 prefix, surrogate offset, and add the surrogate prefixes - // in one magic 16-bit addition. - // similar magic number but without the continue byte adjust and halfword swapped - // UTF-8 4b prefix = -0xF000|0x0000 - // surrogate offset = -0x0040|0x0000 (0x10000 << 6) - // surrogate high = +0xD800|0x0000 - // surrogate low = +0x0000|0xDC00 - // ----------------------------------- - // = +0xE7C0|0xDC00 - uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00)); - // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete - uint32x4_t surrogates = vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic)); - // If the high bit is 1 (s32 less than zero), this needs a surrogate pair - uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm)); + // Store to array + simdutf_really_inline void store(int8_t dst[16]) const { + return __lsx_vst(value, dst, 0); + } - // Select either the 4 byte surrogate pair or the 2 byte solo codepoint - // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd - // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed); - // Byte swap if necessary - if (!match_system(big_endian)) { - selected = vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected))); - } - // Attempting to shuffle and store would be complex, just scalarize. - uint32_t buffer[4]; - vst1q_u32(buffer, selected); - // Test for the top bit of the surrogate mask. - const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : 0x00800000; - for (size_t i = 0; i < 3; i++) { - // Surrogate - if (buffer[i] & SURROGATE_MASK) { - utf16_output[0] = uint16_t(buffer[i] >> 16); - utf16_output[1] = uint16_t(buffer[i] & 0xFFFF); - utf16_output += 2; - } else { - utf16_output[0] = uint16_t(buffer[i] & 0xFFFF); - utf16_output++; - } - } - return consumed; - } else { - // here we know that there is an error but we do not handle errors - return 12; + simdutf_really_inline operator simd8() const { + return ((__m128i)this->value); } -} -/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */ -/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */ -// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the -// end of the code points. 
Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). -size_t convert_masked_utf8_to_utf32(const char *input, - uint64_t utf8_end_of_code_point_mask, - char32_t *&utf32_out) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - uint32_t*& utf32_output = reinterpret_cast(utf32_out); - uint8x16_t in = vld1q_u8(reinterpret_cast(input)); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xFFF; - // - // Optimization note: our main path below is load-latency dependent. Thus it is maybe - // beneficial to have fast paths that depend on branch prediction but have less latency. - // This results in more instructions but, potentially, also higher speeds. - // - // We first try a few fast paths. - if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { - // We process in chunks of 16 bytes. - // use fast implementation in src/simdutf/arm64/simd.h - // Ideally the compiler can keep the tables in registers. - simd8 temp{vreinterpretq_s8_u8(in)}; - temp.store_ascii_as_utf32_tbl(utf32_out); - utf32_output += 16; // We wrote 16 32-bit characters. - return 16; // We consumed 16 bytes. + simdutf_really_inline simd8 + operator|(const simd8 other) const { + return __lsx_vor_v((__m128i)value, (__m128i)other.value); } - if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units. - // Convert to UTF-16 - uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in); - // Zero extend and store via ST2 with a zero. - uint16x4x2_t interleaver = {{ composed_utf16, vmov_n_u16(0) }}; - vst2_u16(reinterpret_cast(utf32_output), interleaver); - utf32_output += 4; // We wrote 4 32-bit characters. - return 12; // We consumed 12 bytes. + + simdutf_really_inline bool is_ascii() const { + return (__lsx_vpickve2gr_hu(__lsx_vmskgez_b((__m128i)this->value), 0) == + 0xffff); } - // 2 byte sequences occur in short bursts in languages like Greek and Russian. - if(input_utf8_end_of_code_point_mask == 0xaaa) { - // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte UTF-32 code units. - // Convert to UTF-16 - uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in); - // Zero extend and store via ST2 with a zero. - uint16x8x2_t interleaver = {{ composed_utf16, vmovq_n_u16(0) }}; - vst2q_u16(reinterpret_cast(utf32_output), interleaver); - utf32_output += 6; // We wrote 6 32-bit characters. - return 12; // We consumed 12 bytes. + // Order-sensitive comparisons + simdutf_really_inline simd8 operator>(const simd8 other) const { + return __lsx_vslt_b((__m128i)other.value, (__m128i)value); + } + simdutf_really_inline simd8 operator<(const simd8 other) const { + return __lsx_vslt_b((__m128i)value, (__m128i)other.value); } - /// Either no fast path or an unimportant fast path. 
- const uint8_t idx = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + template + simdutf_really_inline simd8 + prev(const simd8 prev_chunk) const { + return __lsx_vor_v(__lsx_vbsll_v(this->value, N), + __lsx_vbsrl_v(prev_chunk.value, 16 - N)); + } + template + simdutf_really_inline simd8 + apply_lookup_16_to(const simd8 original) const { + __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f)); + return __lsx_vshuf_b(__lsx_vldi(0), (__m128i)this->value, + simd8(original_tmp)); + } +}; - if (idx < 64) { - // SIX (6) input code-code units - // Convert to UTF-16 - uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx); - // Zero extend and store with ST2 and zero - uint16x8x2_t interleaver = {{ composed_utf16, vmovq_n_u16(0) }}; - vst2q_u16(reinterpret_cast(utf32_output), interleaver); - utf32_output += 6; // We wrote 6 32-bit characters. - return consumed; - } else if (idx < 145) { - // FOUR (4) input code-code units - // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. - uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - // Shuffle - // 1 byte: 00000000 00000000 0ccccccc - // 2 byte: 00000000 110bbbbb 10cccccc - // 3 byte: 1110aaaa 10bbbbbb 10cccccc - uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); - // Split - // 00000000 00000000 0ccccccc - uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits - // Note: unmasked - // xxxxxxxx aaaaxxxx xxxxxxxx - uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits - // Use 16 bit bic instead of and. - // The top bits will be corrected later in the bsl - // 00000000 10bbbbbb 00000000 - uint32x4_t middle = - vreinterpretq_u32_u16(vbicq_u16(vreinterpretq_u16_u32(perm), vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits - // Combine low and middle with shift right accumulate - // 00000000 00xxbbbb bbcccccc - uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2); - // Insert top 4 bits from high byte with bitwise select - // 00000000 aaaabbbb bbcccccc - uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid); - vst1q_u32(utf32_output, composed); - utf32_output += 4; // We wrote 4 32-bit characters. - return consumed; - } else if (idx < 209) { - // THREE (3) input code-code units - if (input_utf8_end_of_code_point_mask == 0x888) { - // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte UTF-32 code units. - // This uses the same method as the fixed 3 byte version, reversing and shift left insert. - // However, there is no need for a shuffle mask now, just rev16 and rev32. - // - // This version does not use the LUT, but 4 byte sequences are less common and the - // overhead of the extra memory access is less important than the early branch overhead - // in shorter sequences, so it comes last. 
+template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert( + NUM_CHUNKS == 4, + "LoongArch kernel should use four registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; - // Swap pairs of bytes - // 10dddddd|10cccccc|10bbbbbb|11110aaa - // 10cccccc 10dddddd|11110aaa 10bbbbbb - uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in)); - // Shift left and insert - // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb - uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6); - // Swap 16-bit lanes - // xxxxcccc ccdddddd xxxxxxxa aabbbbbb - // xxxxxxxa aabbbbbb xxxxcccc ccdddddd - uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1)); - // Shift insert again - // xxxxxxxx xxxaaabb bbbbcccc ccdddddd - uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12); - // Clear the garbage - // 00000000 000aaabb bbbbcccc ccdddddd - uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF)); - // Store - vst1q_u32(utf32_output, composed); + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed - utf32_output += 3; // We wrote 3 32-bit characters. - return 12; // We consumed 12 bytes. - } - // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit due to - // surrogates no longer being involved. - uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - // 1 byte: 00000000 00000000 00000000 0ddddddd - // 2 byte: 00000000 00000000 110ccccc 10dddddd - // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd - // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd - uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); - // Ascii - uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); - uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00)); - // When converting the way we do, the 3 byte prefix will be interpreted as the - // 18th bit being set, since the code would interpret the lead byte (0b1110bbbb) - // as a continuation byte (0b10bbbbbb). To fix this, we can either xor or do an - // 8 bit add of the 6th bit shifted right by 1. Since NEON has shift right accumulate, - // we use that. - // 4 byte 3 byte - // 10bbbbbb 1110bbbb - // 00000000 01000000 6th bit - // 00000000 00100000 shift right - // 10bbbbbb 0000bbbb add - // 00bbbbbb 0000bbbb mask - uint8x16_t correction = - vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000))); - uint32x4_t corrected = - vreinterpretq_u32_u8(vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1)); - // 00000000 00000000 0000cccc ccdddddd - uint32x4_t cd = vsraq_n_u32(ascii, middle, 2); - // Insert twice - // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx - uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6), vshrq_n_u32(corrected, 4)); - // 00000000 000aaabb bbbbcccc ccdddddd - uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab); - // Store - vst1q_u32(utf32_output, composed); - utf32_output += 3; // We wrote 3 32-bit characters. 
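// Illustrative sketch (not from the vendored sources): the bit layout
// 00000000 000aaabb bbbbcccc ccdddddd that the rev16/shift-left-insert
// sequence above assembles, computed for a single pre-validated 4-byte
// sequence 11110aaa 10bbbbbb 10cccccc 10dddddd.
#include <cstdint>
inline uint32_t decode_utf8_4byte(const uint8_t b[4]) {
  return (uint32_t(b[0] & 0x07) << 18) | (uint32_t(b[1] & 0x3f) << 12) |
         (uint32_t(b[2] & 0x3f) << 6) | uint32_t(b[3] & 0x3f);
}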
- return consumed; - } else { - // here we know that there is an error but we do not handle errors - return 12; + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, + const simd8 chunk2, const simd8 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T *ptr) + : chunks{simd8::load(ptr), + simd8::load(ptr + sizeof(simd8) / sizeof(T)), + simd8::load(ptr + 2 * sizeof(simd8) / sizeof(T)), + simd8::load(ptr + 3 * sizeof(simd8) / sizeof(T))} {} + + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd8) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd8) * 1 / sizeof(T)); + this->chunks[2].store(ptr + sizeof(simd8) * 2 / sizeof(T)); + this->chunks[3].store(ptr + sizeof(simd8) * 3 / sizeof(T)); } -} -/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */ -/* begin file src/arm64/arm_convert_utf8_to_latin1.cpp */ -// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 16, usually 12). -size_t convert_masked_utf8_to_latin1(const char *input, - uint64_t utf8_end_of_code_point_mask, - char *&latin1_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - uint8x16_t in = vld1q_u8(reinterpret_cast(input)); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - // - // Optimization note: our main path below is load-latency dependent. Thus it is maybe - // beneficial to have fast paths that depend on branch prediction but have less latency. - // This results in more instructions but, potentially, also higher speeds. - // We first try a few fast paths. - // The obvious first test is ASCII, which actually consumes the full 16. - if((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) { - // We process in chunks of 16 bytes - vst1q_u8(reinterpret_cast(latin1_output), in); - latin1_output += 16; // We wrote 16 18-bit characters. - return 16; // We consumed 16 bytes. + simdutf_really_inline simd8x64 &operator|=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + this->chunks[2] |= other.chunks[2]; + this->chunks[3] |= other.chunks[3]; + return *this; } - /// We do not have a fast path available, or the fast path is unimportant, so we fallback. - const uint8_t idx = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; - // this indicates an invalid input: - if(idx >= 64) { return consumed; } - // Here we should have (idx < 64), if not, there is a bug in the validation or elsewhere. - // SIX (6) input code-code units - // this is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six code - // code units spanning between 1 and 2 bytes each is 12 bytes. - // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. - // This is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six code - // code units spanning between 1 and 2 bytes each is 12 bytes. 
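// Illustrative sketch (not from the vendored sources): scalar version of the
// 1-2 byte UTF-8 to Latin-1 step handled above. Only lead bytes 0xC2 and
// 0xC3 can occur in valid input, since [110aaaaa][10bbbbbb] must fit in
// eight bits; input is assumed pre-validated, as in the routine above.
#include <cstdint>
#include <cstddef>
inline size_t utf8_to_latin1(const uint8_t *src, size_t len, uint8_t *dst) {
  size_t out = 0;
  for (size_t i = 0; i < len;) {
    if (src[i] < 0x80) {
      dst[out++] = src[i++];                                     // ASCII
    } else {
      dst[out++] = uint8_t((src[i] << 6) | (src[i + 1] & 0x3f)); // aa|bbbbbb
      i += 2;
    }
  }
  return out;
}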
- uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - // Shuffle - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 110aaaaa 10bbbbbb - uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh)); - // Mask - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 00000000 00bbbbbb - uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits - // 1 byte: 00000000 00000000 - // 2 byte: 000aaaaa 00000000 - uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits - // Combine with a shift right accumulate - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 00000aaa aabbbbbb - uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); - // writing 8 bytes even though we only care about the first 6 bytes. - uint8x8_t latin1_packed = vmovn_u16(composed); - vst1_u8(reinterpret_cast(latin1_output), latin1_packed); - latin1_output += 6; // We wrote 6 bytes. - return consumed; -} + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); + } -/* end file src/arm64/arm_convert_utf8_to_latin1.cpp */ + simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); } -/* begin file src/arm64/arm_convert_utf16_to_latin1.cpp */ + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 0); + this->chunks[1].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 1); + this->chunks[2].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 2); + this->chunks[3].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 3); + } -template -std::pair arm_convert_utf16_to_latin1(const char16_t* buf, size_t len, char* latin1_output) { - const char16_t* end = buf + len; - while (buf + 8 <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); } - if (vmaxvq_u16(in) <= 0xff) { - // 1. pack the bytes - uint8x8_t latin1_packed = vmovn_u16(in); - // 2. store (8 bytes) - vst1_u8(reinterpret_cast(latin1_output), latin1_packed); - // 3. adjust pointers - buf += 8; - latin1_output += 8; - } else { - return std::make_pair(nullptr, reinterpret_cast(latin1_output)); - } - } // while - return std::make_pair(buf, latin1_output); -} + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + this->chunks[0].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 0); + this->chunks[1].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 1); + this->chunks[2].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 2); + this->chunks[3].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 3); + } -template -std::pair arm_convert_utf16_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) { - const char16_t* start = buf; - const char16_t* end = buf + len; - while (buf + 8 <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); } - if (vmaxvq_u16(in) <= 0xff) { - // 1. pack the bytes - uint8x8_t latin1_packed = vmovn_u16(in); - // 2. store (8 bytes) - vst1_u8(reinterpret_cast(latin1_output), latin1_packed); - // 3. adjust pointers - buf += 8; - latin1_output += 8; - } else { - // Let us do a scalar fallback. - for(int k = 0; k < 8; k++) { - uint16_t word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if(word <= 0xff) { - *latin1_output++ = char(word); - } else { - return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output); - } - } - } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); -} -/* end file src/arm64/arm_convert_utf16_to_latin1.cpp */ -/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */ -/* - The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit code units. + simdutf_really_inline uint64_t to_bitmask() const { + __m128i mask = __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[3]), 6); + mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[2]), 4)); + mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[1]), 2)); + mask = __lsx_vor_v(mask, __lsx_vmsknz_b(this->chunks[0])); + return __lsx_vpickve2gr_du(mask, 0); + } - We consider three cases: - 1. an input register contains no surrogates and each value - is in range 0x0000 .. 0x07ff. - 2. an input register contains no surrogates and values are - is in range 0x0000 .. 0xffff. - 3. an input register contains surrogates --- i.e. codepoints - can have 16 or 32 bits. + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] < mask, this->chunks[1] < mask, + this->chunks[2] < mask, this->chunks[3] < mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] > mask, this->chunks[1] > mask, + this->chunks[2] > mask, this->chunks[3] > mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(simd8(this->chunks[0].value) >= mask, + simd8(this->chunks[1].value) >= mask, + simd8(this->chunks[2].value) >= mask, + simd8(this->chunks[3].value) >= mask) + .to_bitmask(); + } +}; // struct simd8x64 - Ad 1. +/* begin file src/simdutf/lsx/simd16-inl.h */ +template struct simd16; - When values are less than 0x0800, it means that a 16-bit code unit - can be converted into: 1) single UTF8 byte (when it's an ASCII - char) or 2) two UTF8 bytes. +template > struct base_u16 { + __m128i value; + static const size_t SIZE = sizeof(value); + static const size_t ELEMENTS = sizeof(value) / sizeof(T); - For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single - shuffle. + // Conversion from/to SIMD register + simdutf_really_inline base_u16() = default; + simdutf_really_inline base_u16(const __m128i _value) : value(_value) {} + // Bit operations + simdutf_really_inline simd16 operator|(const simd16 other) const { + return __lsx_vor_v(this->value, other.value); + } + simdutf_really_inline simd16 operator&(const simd16 other) const { + return __lsx_vand_v(this->value, other.value); + } + simdutf_really_inline simd16 operator~() const { + return __lsx_vxori_b(this->value, 0xFF); + } - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + friend simdutf_really_inline Mask operator==(const simd16 lhs, + const simd16 rhs) { + return __lsx_vseq_h(lhs.value, rhs.value); + } - Ad 2. 
+ template + simdutf_really_inline simd16 byte_right_shift() const { + return __lsx_vbsrl_v(this->value, N); + } - When values fit in 16-bit code units, but are above 0x07ff, then - a single word may produce one, two or three UTF8 bytes. + simdutf_really_inline uint16_t first() const { + return uint16_t(__lsx_vpickve2gr_w(value, 0)); + } +}; - We prepare data for all these three cases in two registers. - The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. +template > +struct base16 : base_u16 { + using bitmask_type = uint16_t; - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. + simdutf_really_inline base16() : base_u16() {} + simdutf_really_inline base16(const __m128i _value) : base_u16(_value) {} + template + simdutf_really_inline base16(const Pointer *ptr) + : base16(__lsx_vld(ptr, 0)) {} - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + static const int SIZE = sizeof(base_u16::value); + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return __lsx_vor_v(__lsx_vbsll_v(*this, N * 2), + __lsx_vbsrl_v(prev_chunk, 16 - N * 2)); + } +}; - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ -/* - Returns a pair: the first unprocessed byte from buf and utf8_output - A scalar routing should carry on the conversion of the tail. -*/ -template -std::pair arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) { - uint8_t * utf8_output = reinterpret_cast(utf8_out); - const char16_t* end = buf + len; +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd16 : base16 { + static simdutf_really_inline simd16 splat(bool _value) { + return __lsx_vreplgr2vr_h(uint16_t(-(!!_value))); + } - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 - while (buf + 16 + safety_margin <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); } - if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! - // It is common enough that we have sequences of 16 consecutive ASCII characters. - uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); - if (!match_system(big_endian)) { nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin))); } - if(vmaxvq_u16(nextin) > 0x7F) { - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(in); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - in = nextin; - } else { - // 1. pack the bytes - // obviously suboptimal. - uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); - // 2. store (16 bytes) - vst1q_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! 
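The ASCII fast path above narrows sixteen 16-bit code units straight to bytes with vmovn_u16 / vmovn_high_u16. For reference, it computes the same thing as this scalar loop (an illustrative sketch we add here; it is not part of the vendored file):

#include <cstddef>

// Scalar equivalent of the ASCII fast path: when every UTF-16 code unit
// in a block is <= 0x7F, its UTF-8 encoding is just the low byte, which
// is why a simple narrowing store suffices for 16 consecutive units.
static size_t ascii_block_to_utf8(const char16_t *buf, size_t len, char *out) {
  size_t i = 0;
  while (i < len && buf[i] <= 0x7F) {
    out[i] = static_cast<char>(buf[i]);
    ++i;
  }
  return i; // number of code units consumed; the caller continues after this
}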
- } - } + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const __m128i _value) : base16(_value) {} - if (vmaxvq_u16(in) <= 0x7FF) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); - const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); - // 3. prepare bitmask for 8-bit lookup -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = simdutf_make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080); -#else - const uint16x8_t mask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080 }; -#endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + simdutf_really_inline bitmask_type to_bitmask() const { + __m128i mask = __lsx_vmsknz_b(this->value); + bitmask_type mask0 = bitmask_type(__lsx_vpickve2gr_wu(mask, 0)); + return mask0; + } - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); + simdutf_really_inline bool is_zero() const { return __lsx_bz_v(this->value); } +}; - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; +template struct base16_numeric : base16 { + static simdutf_really_inline simd16 splat(T _value) { + return __lsx_vreplgr2vr_h(_value); + } + static simdutf_really_inline simd16 zero() { return __lsx_vldi(0); } - } - const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = simdutf_make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); -#else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; -#endif - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + template + static simdutf_really_inline simd16 load(const Pointer values) { + return __lsx_vld(values, 0); + } - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. 
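Concretely, the three cases map sample code units as follows. This standalone scalar sketch (ours, for illustration only; it assumes the input is not a surrogate, which the surrogates_bytemask check above already guarantees) mirrors the bit layouts named in the comment and shows why the trailing byte of cases #2 and #3 follows one shared rule:

#include <cstdint>

// case 1: U+0041 -> 41        (0xxxxxxx)
// case 2: U+00E9 -> C3 A9     (110xxxxx 10xxxxxx)
// case 3: U+20AC -> E2 82 AC  (1110xxxx 10xxxxxx 10xxxxxx)
static int encode_bmp(uint16_t w, uint8_t *out) { // w must not be a surrogate
  if (w < 0x80) { out[0] = uint8_t(w); return 1; }
  if (w < 0x800) {
    out[0] = uint8_t(0xC0 | (w >> 6));
    out[1] = uint8_t(0x80 | (w & 0x3F)); // trailing byte, same rule as case 3
    return 2;
  }
  out[0] = uint8_t(0xE0 | (w >> 12));
  out[1] = uint8_t(0x80 | ((w >> 6) & 0x3F));
  out[2] = uint8_t(0x80 | (w & 0x3F)); // identical trailing-byte rule
  return 3;
}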
+ simdutf_really_inline base16_numeric(const __m128i _value) + : base16(_value) {} - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + // Store to array + simdutf_really_inline void store(T dst[8]) const { + return __lsx_vst(this->value, dst, 0); + } - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + // Override to distinguish from bool version + simdutf_really_inline simd16<T> operator~() const { + return __lsx_vxori_b(this->value, 0xFF); + } +}; - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e., the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); +// Unsigned code units +template <> struct simd16<uint16_t> : base16_numeric<uint16_t> { + simdutf_really_inline simd16(const __m128i _value) + : base16_numeric((__m128i)_value) {} - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(in, 12); + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); - const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); -#undef simdutf_vec + // Array constructor + simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast<const uint16_t *>(values))) {} - // 4. expand code units 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + // Copy constructor + simdutf_really_inline simd16(const simd16 mask) : simd16(mask.value) {} - // 5.
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = simdutf_make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 ); - const uint16x8_t twomask = simdutf_make_uint16x8_t(0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 ); -#else - const uint16x8_t onemask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 }; - const uint16x8_t twomask = { 0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 }; -#endif - const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. - const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); + // Copy constructor + simdutf_really_inline simd16(const simd16 mask) : simd16(mask.value) {} - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + // Order-specific operations + simdutf_really_inline simd16 &operator+=(const simd16 other) { + value = __lsx_vadd_h(value, other.value); + return *this; + } - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + template + static simdutf_really_inline simd8 + pack_shifted_right(const simd16 &v0, const simd16 &v1) { + return __lsx_vssrlni_bu_h(v1.value, v0.value, N); + } - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; + // Pack with the unsigned saturation of two uint16_t code units into single + // uint8_t vector + static simdutf_really_inline simd8 pack(const simd16 &v0, + const simd16 &v1) { + return pack_shifted_right<0>(v0, v1); + } - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word & 0xFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xF800 ) != 0xD800) { - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - } - } - buf += k; - } - } // while + // Change the endianness + simdutf_really_inline simd16 swap_bytes() const { + return __lsx_vshuf4i_b(this->value, 0b10110001); + } - return std::make_pair(buf, reinterpret_cast(utf8_output)); -} + simdutf_really_inline uint64_t sum() const { + const auto sum_u32 = __lsx_vhaddw_wu_hu(value, value); + const auto sum_u64 = __lsx_vhaddw_du_wu(sum_u32, sum_u32); + return uint64_t(__lsx_vpickve2gr_du(sum_u64, 0)) + + uint64_t(__lsx_vpickve2gr_du(sum_u64, 1)); + } +}; -/* - Returns a pair: a result struct and utf8_output. - If there is an error, the count field of the result is the position of the error. - Otherwise, it is the position of the first unprocessed byte in buf (even if finished). - A scalar routing should carry on the conversion of the tail if needed. -*/ -template -std::pair arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out) { - uint8_t * utf8_output = reinterpret_cast(utf8_out); - const char16_t* start = buf; - const char16_t* end = buf + len; +simdutf_really_inline simd16 operator<(const simd16 a, + const simd16 b) { + return __lsx_vslt_hu(a.value, b.value); +} - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 +simdutf_really_inline simd16 operator>(const simd16 a, + const simd16 b) { + return __lsx_vslt_hu(b.value, a.value); +} - while (buf + 16 + safety_margin <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); } - if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! - // It is common enough that we have sequences of 16 consecutive ASCII characters. - uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); - if (!match_system(big_endian)) { nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin))); } - if(vmaxvq_u16(nextin) > 0x7F) { - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(in); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - in = nextin; - } else { - // 1. pack the bytes - // obviously suboptimal. - uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); - // 2. 
store (16 bytes) - vst1q_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } - } +simdutf_really_inline simd16 operator<=(const simd16 a, + const simd16 b) { + return __lsx_vsle_hu(a.value, b.value); +} - if (vmaxvq_u16(in) <= 0x7FF) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); - const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); - // 3. prepare bitmask for 8-bit lookup -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = simdutf_make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080); -#else - const uint16x8_t mask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080 }; -#endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); +simdutf_really_inline simd16 operator>=(const simd16 a, + const simd16 b) { + return __lsx_vsle_hu(b.value, a.value); +} - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); +template struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert( + NUM_CHUNKS == 4, + "LOONGARCH kernel should use four registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; + simd16x32(const simd16x32 &o) = delete; // no copy allowed + simd16x32 & + operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed - } - const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = simdutf_make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); -#else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; -#endif - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + simdutf_really_inline + simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, + const simd16<T> chunk2, const simd16<T> chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T *ptr) + : chunks{simd16<T>::load(ptr), + simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)), + simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)), + simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {} - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T)); + this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T)); + this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T)); + } - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); + this->chunks[2] = this->chunks[2].swap_bytes(); + this->chunks[3] = this->chunks[3].swap_bytes(); + } + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16<T> mask = simd16<T>::splat(m); + return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask, + this->chunks[2] <= mask, this->chunks[3] <= mask) + .to_bitmask(); + } +}; // struct simd16x32 - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. +simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> a, + uint16_t b) { + const auto bv = __lsx_vreplgr2vr_h(b); + return __lsx_vxor_v(a.value, bv); +} - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e., the number of bytes to write).
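As a reference for the LSX helpers interleaved above: to_bitmask relies on __lsx_vmsknz_b, which yields one bit per byte (not per 16-bit lane), so the four 16-byte chunks assemble into a 64-bit mask with two bits per UTF-16 code unit. A scalar model of that layout (our illustration, not the library's code):

#include <cstdint>

// Scalar model of simd16x32::to_bitmask(): bit i of the result is set
// exactly when byte i of the 64-byte block is non-zero.
static uint64_t to_bitmask_model(const uint8_t bytes[64]) {
  uint64_t m = 0;
  for (int i = 0; i < 64; ++i)
    m |= uint64_t(bytes[i] != 0) << i;
  return m;
}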
- */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); +simdutf_really_inline simd16 operator^(const simd16 a, + const simd16 b) { + return __lsx_vxor_v(a.value, b.value); +} - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(in, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); - const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); -#undef simdutf_vec +simdutf_really_inline simd16 min(const simd16 a, + const simd16 b) { + return __lsx_vmin_hu(a.value, b.value); +} - // 4. expand code units 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); +simdutf_really_inline simd16 as_vector_u16(const simd16 x) { + return x.value; +} +/* end file src/simdutf/lsx/simd16-inl.h */ +/* begin file src/simdutf/lsx/simd32-inl.h */ +template struct simd32; - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = simdutf_make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 ); - const uint16x8_t twomask = simdutf_make_uint16x8_t(0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 ); -#else - const uint16x8_t onemask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 }; - const uint16x8_t twomask = { 0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 }; -#endif - const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. 
- const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); +template <> struct simd32 { + __m128i value; + static const int SIZE = sizeof(value); + static const int ELEMENTS = SIZE / sizeof(uint32_t); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + // constructors + simdutf_really_inline simd32(__m128i v) : value(v) {} - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + template + simdutf_really_inline simd32(Ptr *ptr) : value(__lsx_vld(ptr, 0)) {} - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; + // in-place operators + simdutf_really_inline simd32 &operator-=(const simd32 other) { + value = __lsx_vsub_w(value, other.value); + return *this; + } - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word & 0xFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xF800 ) != 0xD800) { - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast(utf8_output)); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - } - } - buf += k; - } - } // while + // members + simdutf_really_inline uint64_t sum() const { + return uint64_t(__lsx_vpickve2gr_wu(value, 0)) + + uint64_t(__lsx_vpickve2gr_wu(value, 1)) + + uint64_t(__lsx_vpickve2gr_wu(value, 2)) + + uint64_t(__lsx_vpickve2gr_wu(value, 3)); + } - return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf8_output)); -} -/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */ -/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */ -/* - The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit code units. 
+ // static members + static simdutf_really_inline simd32<uint32_t> splat(uint32_t x) { + return __lsx_vreplgr2vr_w(x); + } - We consider three cases: - 1. an input register contains no surrogates and each value - is in range 0x0000 .. 0x07ff. - 2. an input register contains no surrogates and values are - in range 0x0000 .. 0xffff. - 3. an input register contains surrogates --- i.e. codepoints - can have 16 or 32 bits. + static simdutf_really_inline simd32<uint32_t> zero() { + return __lsx_vrepli_w(0); + } +}; - Ad 1. +// ------------------------------------------------------------ - When values are less than 0x0800, it means that a 16-bit code unit - can be converted into: 1) single UTF8 byte (when it's an ASCII - char) or 2) two UTF8 bytes. +template <> struct simd32<bool> { + __m128i value; + static const int SIZE = sizeof(value); - For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single - shuffle. + // constructors + simdutf_really_inline simd32(__m128i v) : value(v) {} +}; - We need a 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. +// ------------------------------------------------------------ - Ad 2. +simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a, + const simd32<uint32_t> b) { + return __lsx_vand_v(a.value, b.value); +} - When values fit in 16-bit code units, but are above 0x07ff, then - a single word may produce one, two or three UTF8 bytes. +simdutf_really_inline simd32<bool> operator<(const simd32<uint32_t> a, + const simd32<uint32_t> b) { + return __lsx_vslt_wu(a.value, b.value); +} - We prepare data for all these three cases in two registers. - The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. +simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a, + const simd32<uint32_t> b) { + return __lsx_vslt_wu(b.value, a.value); +} - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. +// ------------------------------------------------------------ - We need a 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. +simdutf_really_inline simd32<uint32_t> as_vector_u32(const simd32<bool> v) { + return v.value; +} +/* end file src/simdutf/lsx/simd32-inl.h */ +/* begin file src/simdutf/lsx/simd64-inl.h */ +template <typename T> struct simd64; +template <> struct simd64<uint64_t> { + __m128i value; + static const int SIZE = sizeof(value); + static const int ELEMENTS = SIZE / sizeof(uint64_t); - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ -/* - Returns a pair: the first unprocessed byte from buf and utf32_output - A scalar routine should carry on the conversion of the tail.
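That scalar tail must also decode surrogate pairs; the fallback inside the routine below uses the standard UTF-16 formula, checked here in isolation (a self-contained sketch of ours, not the simdutf API):

#include <cassert>
#include <cstdint>

// value = ((hi - 0xD800) << 10) + (lo - 0xDC00) + 0x10000, valid only when
// both differences fit in 10 bits -- exactly the (diff | diff2) > 0x3FF test
// used in the vendored code.
static uint32_t decode_surrogate_pair(uint16_t hi, uint16_t lo) {
  uint16_t diff = uint16_t(hi - 0xD800);
  uint16_t diff2 = uint16_t(lo - 0xDC00);
  assert((uint32_t(diff) | diff2) <= 0x3FF); // else: unpaired surrogate
  return (uint32_t(diff) << 10) + diff2 + 0x10000;
}

int main() {
  // U+1F600 is encoded as the pair 0xD83D 0xDE00.
  return decode_surrogate_pair(0xD83D, 0xDE00) == 0x1F600 ? 0 : 1;
}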
-*/ -template -std::pair arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out) { - uint32_t * utf32_output = reinterpret_cast(utf32_out); - const char16_t* end = buf + len; + // constructors + simdutf_really_inline simd64(__m128i v) : value(v) {} - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + template + simdutf_really_inline simd64(Ptr *ptr) : value(__lsx_vld(ptr, 0)) {} - while (buf + 8 <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); } + // in-place operators + simdutf_really_inline simd64 &operator+=(const simd64 other) { + value = __lsx_vadd_d(value, other.value); + return *this; + } - const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: no surrogate pairs, extend all 16-bit code units to 32-bit code units - vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); - vst1q_u32(utf32_output+4, vmovl_high_u16(in)); - utf32_output += 8; - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word &0xF800 ) != 0xD800) { - *utf32_output++ = char32_t(word); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf32_output)); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - } - } - buf += k; - } - } // while - return std::make_pair(buf, reinterpret_cast(utf32_output)); -} + // members + simdutf_really_inline uint64_t sum() const { + return uint64_t(__lsx_vpickve2gr_du(value, 0)) + + uint64_t(__lsx_vpickve2gr_du(value, 1)); + } + // static members + static simdutf_really_inline simd64 zero() { + return __lsx_vrepli_d(0); + } +}; -/* - Returns a pair: a result struct and utf8_output. - If there is an error, the count field of the result is the position of the error. - Otherwise, it is the position of the first unprocessed byte in buf (even if finished). - A scalar routing should carry on the conversion of the tail if needed. 
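A minimal model of that (result, output pointer) convention, with simplified stand-in types (ours, for illustration only; the real definitions live elsewhere in this header):

#include <cstddef>

// Simplified shapes of the types used above: on success, count is the
// number of input code units processed; on error, it is the error position.
enum class error_code { SUCCESS, SURROGATE, TOO_LARGE };
struct result { error_code error; size_t count; };

// Either way, the caller resumes (or reports) at input index r.count.
static size_t resume_index(const result &r) { return r.count; }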
-*/ -template -std::pair arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out) { - uint32_t * utf32_output = reinterpret_cast(utf32_out); - const char16_t* start = buf; - const char16_t* end = buf + len; +// ------------------------------------------------------------ - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); +template <> struct simd64 { + __m128i value; + static const int SIZE = sizeof(value); - while (buf + 8 <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); } + // constructors + simdutf_really_inline simd64(__m128i v) : value(v) {} +}; - const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: no surrogate pairs, extend all 16-bit code units to 32-bit code units - vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); - vst1q_u32(utf32_output+4, vmovl_high_u16(in)); - utf32_output += 8; - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word &0xF800 ) != 0xD800) { - *utf32_output++ = char32_t(word); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast(utf32_output)); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - } - } - buf += k; - } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf32_output)); -} -/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */ +// ------------------------------------------------------------ -/* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */ -std::pair arm_convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) { - const char32_t* end = buf + len; - while (buf + 8 <= end) { - uint32x4_t in1 = vld1q_u32(reinterpret_cast(buf)); - uint32x4_t in2 = vld1q_u32(reinterpret_cast(buf+4)); +simd64 sum_8bytes(const simd8 v) { + const auto sum_u16 = __lsx_vhaddw_hu_bu(v, v); + const auto sum_u32 = __lsx_vhaddw_wu_hu(sum_u16, sum_u16); + const auto sum_u64 = __lsx_vhaddw_du_wu(sum_u32, sum_u32); - uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2)); - if (vmaxvq_u16(utf16_packed) <= 0xff) { - // 1. pack the bytes - uint8x8_t latin1_packed = vmovn_u16(utf16_packed); - // 2. store (8 bytes) - vst1_u8(reinterpret_cast(latin1_output), latin1_packed); - // 3. 
adjust pointers - buf += 8; - latin1_output += 8; - } else { - return std::make_pair(nullptr, reinterpret_cast(latin1_output)); - } - } // while - return std::make_pair(buf, latin1_output); + return simd64(sum_u64); } +/* end file src/simdutf/lsx/simd64-inl.h */ +} // namespace simd +} // unnamed namespace +} // namespace lsx +} // namespace simdutf -std::pair arm_convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) { - const char32_t* start = buf; - const char32_t* end = buf + len; +#endif // SIMDUTF_LSX_SIMD_H +/* end file src/simdutf/lsx/simd.h */ - while (buf + 8 <= end) { - uint32x4_t in1 = vld1q_u32(reinterpret_cast(buf)); - uint32x4_t in2 = vld1q_u32(reinterpret_cast(buf+4)); +/* begin file src/simdutf/lsx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP +/* end file src/simdutf/lsx/end.h */ - uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2)); +#endif // SIMDUTF_IMPLEMENTATION_LSX - if (vmaxvq_u16(utf16_packed) <= 0xff) { - // 1. pack the bytes - uint8x8_t latin1_packed = vmovn_u16(utf16_packed); - // 2. store (8 bytes) - vst1_u8(reinterpret_cast(latin1_output), latin1_packed); - // 3. adjust pointers - buf += 8; - latin1_output += 8; - } else { - // Let us do a scalar fallback. - for(int k = 0; k < 8; k++) { - uint32_t word = buf[k]; - if(word <= 0xff) { - *latin1_output++ = char(word); - } else { - return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output); - } - } - } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); -} -/* end file src/arm64/arm_convert_utf32_to_latin1.cpp */ -/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */ -std::pair arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out) { - uint8_t * utf8_output = reinterpret_cast(utf8_out); - const char32_t* end = buf + len; +#endif // SIMDUTF_LSX_H +/* end file src/simdutf/lsx.h */ +/* begin file src/simdutf/fallback.h */ +#ifndef SIMDUTF_FALLBACK_H +#define SIMDUTF_FALLBACK_H - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); - uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 +// Note that fallback.h is always imported last. - while (buf + 16 + safety_margin < end) { - uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); - uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf+4)); - - // Check if no bits set above 16th - if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { - // Pack UTF-32 to UTF-16 safely (without surrogate pairs) - // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) - uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); - if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(utf16_packed); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - continue; // we are done for this round! - } - - if (vmaxvq_u16(utf16_packed) <= 0x7FF) { - // 1. 
prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); - const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4)); - // 3. prepare bitmask for 8-bit lookup - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = simdutf_make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080); +// Default Fallback to on unless a builtin implementation has already been +// selected. +#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK + #if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || \ + SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE || \ + SIMDUTF_CAN_ALWAYS_RUN_PPC64 || SIMDUTF_CAN_ALWAYS_RUN_RVV || \ + SIMDUTF_CAN_ALWAYS_RUN_LSX || SIMDUTF_CAN_ALWAYS_RUN_LASX + #define SIMDUTF_IMPLEMENTATION_FALLBACK 0 #else - const uint16x8_t mask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080 }; + #define SIMDUTF_IMPLEMENTATION_FALLBACK 1 #endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); - - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); +#endif - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; - } else { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); - forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask); +#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK) - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = simdutf_make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - #else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; - #endif - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes +#if SIMDUTF_IMPLEMENTATION_FALLBACK - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. +namespace simdutf { +/** + * Fallback implementation (runs on any machine). 
+ */ +namespace fallback {} // namespace fallback +} // namespace simdutf - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. - - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. - - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ - #define simdutf_vec(x) vmovq_n_u16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); - - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff); - const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); - #undef simdutf_vec - - // 4. expand code units 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); - - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = simdutf_make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 ); - const uint16x8_t twomask = simdutf_make_uint16x8_t(0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 ); - #else - const uint16x8_t onemask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 }; - const uint16x8_t twomask = { 0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 }; - #endif - const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. 
- const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); - - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); +/* begin file src/simdutf/fallback/implementation.h */ +#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H +#define SIMDUTF_FALLBACK_IMPLEMENTATION_H - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; - buf += 8; - } - // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes. - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word & 0xFFFF0000)==0) { - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } - } - buf += k; - } - } // while +namespace simdutf { +namespace fallback { - // check for invalid input - if (vmaxvq_u16(forbidden_bytemask) != 0) { - return std::make_pair(nullptr, reinterpret_cast(utf8_output)); - } - return std::make_pair(buf, reinterpret_cast(utf8_output)); +namespace { +using namespace simdutf; } +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() + : simdutf::implementation("fallback", "Generic fallback implementation", + 0) {} + +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool 
validate_ascii(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept final; + + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; + void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept final; + void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const 
char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + 
convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *buf, size_t length, + char16_t *output) const noexcept final; + simdutf_warn_unused size_t + count_utf16le(const char16_t *buf, size_t length) const noexcept override; + simdutf_warn_unused size_t + count_utf16be(const char16_t *buf, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept override; + simdutf_warn_unused size_t utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept override; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && 
SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf16_length_from_utf8( + const char *input, size_t length) const noexcept override; + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept override; + ; + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept override; + ; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf8( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t latin1_length_from_utf8( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t utf8_length_from_latin1( + const char *input, size_t length) const noexcept override; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused result base64_to_binary( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept override; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept override; + size_t + binary_to_base64_with_lines(const char *input, size_t length, char *output, + size_t line_length, + base64_options options) const noexcept override; + const char *find(const char *start, const char *end, + char character) const noexcept override; + const char16_t *find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept override; + +#endif // SIMDUTF_FEATURE_BASE64 +}; +} // namespace fallback +} // namespace simdutf -std::pair arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out) { - uint8_t * utf8_output = reinterpret_cast(utf8_out); - const char32_t* start = buf; - const char32_t* end = buf + len; +#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H +/* end file src/simdutf/fallback/implementation.h */ - const uint16x8_t v_c080 = 
vmovq_n_u16((uint16_t)0xc080); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 +/* begin file src/simdutf/fallback/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "fallback" +// #define SIMDUTF_IMPLEMENTATION fallback +/* end file src/simdutf/fallback/begin.h */ - while (buf + 16 + safety_margin < end) { - uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf)); - uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4)); + // Declarations +/* begin file src/simdutf/fallback/bitmanipulation.h */ +#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H +#define SIMDUTF_FALLBACK_BITMANIPULATION_H - // Check if no bits set above 16th - if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { - // Pack UTF-32 to UTF-16 safely (without surrogate pairs) - // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) - uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); - if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(utf16_packed); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - continue; // we are done for this round! - } +#include <limits> - if (vmaxvq_u16(utf16_packed) <= 0x7FF) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); +namespace simdutf { +namespace fallback { +namespace {} // unnamed namespace +} // namespace fallback +} // namespace simdutf - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); - const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4)); - // 3. prepare bitmask for 8-bit lookup - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = simdutf_make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080); - #else - const uint16x8_t mask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080 }; - #endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); +#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H +/* end file src/simdutf/fallback/bitmanipulation.h */ - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); +/* begin file src/simdutf/fallback/end.h */ +/* end file src/simdutf/fallback/end.h */ - // 6.
adjust pointers - buf += 8; - utf8_output += row[0]; - continue; - } else { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes +#endif // SIMDUTF_IMPLEMENTATION_FALLBACK +#endif // SIMDUTF_FALLBACK_H +/* end file src/simdutf/fallback.h */ +#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO +SIMDUTF_POP_DISABLE_WARNINGS +#endif - // check for invalid input - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); - const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)); - if (vmaxvq_u16(forbidden_bytemask) != 0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast(utf8_output)); - } +// The scalar routines should be included once. +#if SIMDUTF_FEATURE_ASCII +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING || \ + (SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1) +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING || + // (SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1) +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_LATIN1 +#endif // SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_BASE64 +#endif // SIMDUTF_FEATURE_BASE64 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && \ + (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_LATIN1) +#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || + // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_LATIN1) + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32 +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = simdutf_make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - #else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; - #endif - /* In 
this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UTF-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes +/* begin file src/implementation.cpp */ +#include <initializer_list> +#include <climits> +#include <type_traits> +#if SIMDUTF_ATOMIC_REF + #include <atomic> +#endif - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. +static_assert(sizeof(uint8_t) == sizeof(char), + "simdutf requires that uint8_t be a char"); +static_assert(sizeof(uint16_t) == sizeof(char16_t), + "simdutf requires that char16_t be 16 bits"); +static_assert(sizeof(uint32_t) == sizeof(char32_t), + "simdutf requires that char32_t be 32 bits"); +// next line is redundant, but it is kept to catch defective systems. +static_assert(CHAR_BIT == 8, "simdutf requires 8-bit bytes"); - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. - - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. - - Finally from these two code units we build a proper UTF-8 sequence, taking - into account the case (i.e., the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ - #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); - - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff); - const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); - #undef simdutf_vec - - // 4. expand code units 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); - - // 5.
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = simdutf_make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 ); - const uint16x8_t twomask = simdutf_make_uint16x8_t(0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 ); - #else - const uint16x8_t onemask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 }; - const uint16x8_t twomask = { 0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 }; - #endif - const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. - const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); - - const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); +// Useful for debugging purposes +namespace simdutf { +namespace { +template <typename T> std::string toBinaryString(T b) { + std::string binary = ""; + T mask = T(1) << (sizeof(T) * CHAR_BIT - 1); + while (mask > 0) { + binary += ((b & mask) == 0) ? '0' : '1'; + mask >>= 1; + } + return binary; +} +} // namespace +} // namespace simdutf - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; - buf += 8; - } - // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes. - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - in the presence of surrogate pairs may require non-trivial tables.
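// The deleted ARM kernel above packs each 16-bit code unit into 1, 2 or 3
// UTF-8 bytes with table-driven shuffles. As a scalar reference for the
// three-case layout it implements (sketch only; the helper name is ours,
// not simdutf's, and it assumes `word` is not an unpaired surrogate --
// the vector code checks surrogates separately):
#include <cstdint>
#include <cstddef>
static inline size_t encode_bmp_code_unit(uint16_t word, uint8_t *out) {
  if (word < 0x80) { // [0000|0000|0ccc|cccc] => [0ccc|cccc]
    out[0] = uint8_t(word);
    return 1;
  }
  if (word < 0x800) { // [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]
    out[0] = uint8_t(0xC0 | (word >> 6));
    out[1] = uint8_t(0x80 | (word & 0x3F));
    return 2;
  }
  // [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc]
  out[0] = uint8_t(0xE0 | (word >> 12));
  out[1] = uint8_t(0x80 | ((word >> 6) & 0x3F));
  out[2] = uint8_t(0x80 | (word & 0x3F));
  return 3;
}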
- size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word & 0xFFFF0000)==0) { - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char *>(utf8_output)); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char *>(utf8_output)); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } - } - buf += k; - } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char *>(utf8_output)); +namespace simdutf { +bool implementation::supported_by_runtime_system() const { + uint32_t required_instruction_sets = this->required_instruction_sets(); + uint32_t supported_instruction_sets = + internal::detect_supported_architectures(); + return ((supported_instruction_sets & required_instruction_sets) == + required_instruction_sets); +} +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused encoding_type implementation::autodetect_encoding( + const char *input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + // UTF8 is common, it includes ASCII, and is commonly represented + // without a BOM, so if it fits, go with that. Note that it is still + // possible to get it wrong, we are only 'guessing'. If someone has UTF-16 + // data without a BOM, it could pass as UTF-8. + // + // An interesting twist might be to check for UTF-16 ASCII first (every + // other byte is zero). + if (validate_utf8(input, length)) { + return encoding_type::UTF8; + } + // The next most common encoding that might appear without BOM is probably + // UTF-16LE, so try that next.
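// A usage sketch for the detection order implemented above (BOM first, then
// UTF-8, then UTF-16LE, then UTF-32LE). Illustrative only; it relies solely
// on the public simdutf API, and the helper name is ours:
#include "simdutf.h"
static bool looks_like_utf8(const char *data, size_t len) {
  // autodetect_encoding() is best-effort: valid ASCII also validates as
  // UTF-8, and BOM-less UTF-16 text can occasionally pass as UTF-8.
  return simdutf::autodetect_encoding(data, len) ==
         simdutf::encoding_type::UTF8;
}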
+ if ((length % 2) == 0) { + // important: we need to divide by two + if (validate_utf16le(reinterpret_cast<const char16_t *>(input), + length / 2)) { + return encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) { + return encoding_type::UTF32_LE; + } + } + return encoding_type::unspecified; } -/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */ -/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */ -template <endianness big_endian> -std::pair<const char32_t *, char16_t *> arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out) { - uint16_t * utf16_output = reinterpret_cast<uint16_t *>(utf16_out); - const char32_t* end = buf + len; - uint16x4_t forbidden_bytemask = vmov_n_u16(0x0); + #ifdef SIMDUTF_INTERNAL_TESTS +std::vector<implementation::TestProcedure> +implementation::internal_tests() const { + return {}; +} + #endif +#endif // SIMDUTF_FEATURE_DETECT_ENCODING - while(buf + 4 <= end) { - uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf)); +#if SIMDUTF_FEATURE_BASE64 +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} - // Check if no bits set above 16th - if(vmaxvq_u32(in) <= 0xFFFF) { - uint16x4_t utf16_packed = vmovn_u32(in); +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} +simdutf_warn_unused size_t implementation::base64_length_from_binary( + size_t length, base64_options options) const noexcept { + return scalar::base64::base64_length_from_binary(length, options); +} +#endif // SIMDUTF_FEATURE_BASE64 - const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); - const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); - forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask); +namespace internal { +// When there is a single implementation, we should not pay a price +// for dispatching to the best implementation. We should just use the +// one we have. This is a compile-time check. +#define SIMDUTF_SINGLE_IMPLEMENTATION \ + (SIMDUTF_IMPLEMENTATION_ICELAKE + SIMDUTF_IMPLEMENTATION_HASWELL + \ + SIMDUTF_IMPLEMENTATION_WESTMERE + SIMDUTF_IMPLEMENTATION_ARM64 + \ + SIMDUTF_IMPLEMENTATION_PPC64 + SIMDUTF_IMPLEMENTATION_LSX + \ + SIMDUTF_IMPLEMENTATION_LASX + SIMDUTF_IMPLEMENTATION_FALLBACK == \ + 1) + +// Static array of known implementations. We are hoping these get baked into the +// executable without requiring a static initializer. - if (!match_system(big_endian)) { utf16_packed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(utf16_packed))); } - vst1_u16(utf16_output, utf16_packed); - utf16_output += 4; - buf += 4; - } else { - size_t forward = 3; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t *>(utf16_output)); } - *utf16_output++ = !match_system(big_endian) ?
char16_t(word >> 8 | word << 8) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); - low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; - } +#if SIMDUTF_IMPLEMENTATION_ICELAKE +static const icelake::implementation *get_icelake_singleton() { + static const icelake::implementation icelake_singleton{}; + return &icelake_singleton; +} +#endif +#if SIMDUTF_IMPLEMENTATION_HASWELL +static const haswell::implementation *get_haswell_singleton() { + static const haswell::implementation haswell_singleton{}; + return &haswell_singleton; +} +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE +static const westmere::implementation *get_westmere_singleton() { + static const westmere::implementation westmere_singleton{}; + return &westmere_singleton; +} +#endif +#if SIMDUTF_IMPLEMENTATION_ARM64 +static const arm64::implementation *get_arm64_singleton() { + static const arm64::implementation arm64_singleton{}; + return &arm64_singleton; +} +#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 +static const ppc64::implementation *get_ppc64_singleton() { + static const ppc64::implementation ppc64_singleton{}; + return &ppc64_singleton; +} +#endif +#if SIMDUTF_IMPLEMENTATION_RVV +static const rvv::implementation *get_rvv_singleton() { + static const rvv::implementation rvv_singleton{}; + return &rvv_singleton; +} +#endif +#if SIMDUTF_IMPLEMENTATION_LASX +static const lasx::implementation *get_lasx_singleton() { + static const lasx::implementation lasx_singleton{}; + return &lasx_singleton; +} +#endif +#if SIMDUTF_IMPLEMENTATION_LSX +static const lsx::implementation *get_lsx_singleton() { + static const lsx::implementation lsx_singleton{}; + return &lsx_singleton; +} +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK +static const fallback::implementation *get_fallback_singleton() { + static const fallback::implementation fallback_singleton{}; + return &fallback_singleton; +} +#endif + +#if SIMDUTF_SINGLE_IMPLEMENTATION +static const implementation *get_single_implementation() { + return + #if SIMDUTF_IMPLEMENTATION_ICELAKE + get_icelake_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_HASWELL + get_haswell_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_WESTMERE + get_westmere_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_ARM64 + get_arm64_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_PPC64 + get_ppc64_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_LASX + get_lasx_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_LSX + get_lsx_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_FALLBACK + get_fallback_singleton(); + #endif +} +#endif + +/** + * @private Detects best supported implementation on first use, and sets it + */ +class detect_best_supported_implementation_on_first_use final + : public implementation { +public: + std::string name() const noexcept final { return set_best()->name(); } + std::string description() const noexcept final { + return set_best()->description(); + } + uint32_t required_instruction_sets() const noexcept final { + return set_best()->required_instruction_sets(); } - // check for invalid input - if 
(vmaxv_u16(forbidden_bytemask) != 0) { - return std::make_pair(nullptr, reinterpret_cast(utf16_output)); +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int + detect_encodings(const char *input, size_t length) const noexcept override { + return set_best()->detect_encodings(input, length); } +#endif // SIMDUTF_FEATURE_DETECT_ENCODING - return std::make_pair(buf, reinterpret_cast(utf16_output)); -} +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool + validate_utf8(const char *buf, size_t len) const noexcept final override { + return set_best()->validate_utf8(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result validate_utf8_with_errors( + const char *buf, size_t len) const noexcept final override { + return set_best()->validate_utf8_with_errors(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF8 -template -std::pair arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out) { - uint16_t * utf16_output = reinterpret_cast(utf16_out); - const char32_t* start = buf; - const char32_t* end = buf + len; +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_ascii(const char *buf, size_t len) const noexcept final override { + return set_best()->validate_ascii(buf, len); + } + simdutf_warn_unused result validate_ascii_with_errors( + const char *buf, size_t len) const noexcept final override { + return set_best()->validate_ascii_with_errors(buf, len); + } +#endif // SIMDUTF_FEATURE_ASCII - while(buf + 4 <= end) { - uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept final override { + return set_best()->validate_utf16le_as_ascii(buf, len); + } + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept final override { + return set_best()->validate_utf16be_as_ascii(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII - // Check if no bits set above 16th - if(vmaxvq_u32(in) <= 0xFFFF) { - uint16x4_t utf16_packed = vmovn_u32(in); +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool + validate_utf16le(const char16_t *buf, + size_t len) const noexcept final override { + return set_best()->validate_utf16le(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING - const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); - const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); - const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)); - if (vmaxv_u16(forbidden_bytemask) != 0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast(utf16_output)); - } +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool + validate_utf16be(const char16_t *buf, + size_t len) const noexcept final override { + return set_best()->validate_utf16be(buf, len); + } - if (!match_system(big_endian)) { utf16_packed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(utf16_packed))); } - vst1_u16(utf16_output, utf16_packed); - utf16_output += 4; - buf += 4; - } else { - size_t forward = 3; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if 
(word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast(utf16_output)); } - *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast(utf16_output)); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); - low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; - } + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final override { + return set_best()->validate_utf16le_with_errors(buf, len); } - return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf16_output)); -} -/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */ -/* begin file src/arm64/arm_base64.cpp */ -/** - * References and further reading: - * - * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the - * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. - * https://arxiv.org/abs/1910.05109 - * - * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 - * Instructions, ACM Transactions on the Web 12 (3), 2018. - * https://arxiv.org/abs/1704.00605 - * - * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. - * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, - * Request for Comments: 4648. - * - * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. - * http://www.alfredklomp.com/programming/sse-base64/. (2014). - * - * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD - * acceleration. https://github.com/aklomp/base64. (2014). - * - * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). - * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ - * - * Nick Kopp. 2013. Base64 Encoding on a GPU. - * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). 
- */ + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final override { + return set_best()->validate_utf16be_with_errors(buf, len); + } + void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept final override { + return set_best()->to_well_formed_utf16be(input, len, output); + } + void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept final override { + return set_best()->to_well_formed_utf16le(input, len, output); + } +#endif // SIMDUTF_FEATURE_UTF16 -size_t encode_base64(char *dst, const char *src, size_t srclen, - base64_options options) { - // credit: Wojciech Muła - uint8_t *out = (uint8_t *)dst; - constexpr static uint8_t source_table[64] = { - 'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D', - 'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W', - 'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p', - '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8', - 'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/', - }; - constexpr static uint8_t source_table_url[64] = { - 'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D', - 'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W', - 'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p', - '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8', - 'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_', - }; - const uint8x16_t v3f = vdupq_n_u8(0x3f); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - // When trying to load a uint8_t array, Visual Studio might - // error with: error C2664: '__n128x4 neon_ld4m_q8(const char *)': - // cannot convert argument 1 from 'const uint8_t [64]' to 'const char * - const uint8x16x4_t table = - vld4q_u8((reinterpret_cast( - options & base64_url) ? source_table_url : source_table)); -#else - const uint8x16x4_t table = - vld4q_u8((options & base64_url) ? 
source_table_url : source_table); -#endif - size_t i = 0; - for (; i + 16 * 3 <= srclen; i += 16 * 3) { - const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i); - uint8x16x4_t result; - result.val[0] = vshrq_n_u8(in.val[0], 2); - result.val[1] = - vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), v3f); - result.val[2] = - vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), v3f); - result.val[3] = vandq_u8(in.val[2], v3f); - result.val[0] = vqtbl4q_u8(table, result.val[0]); - result.val[1] = vqtbl4q_u8(table, result.val[1]); - result.val[2] = vqtbl4q_u8(table, result.val[2]); - result.val[3] = vqtbl4q_u8(table, result.val[3]); - vst4q_u8(out, result); - out += 64; +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool + validate_utf32(const char32_t *buf, + size_t len) const noexcept final override { + return set_best()->validate_utf32(buf, len); } - out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, - options); +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING - return size_t((char *)out - dst); -} +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final override { + return set_best()->validate_utf32_with_errors(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF32 -static inline void compress(uint8x16_t data, uint16_t mask, char *output) { - if (mask == 0) { - vst1q_u8((uint8_t *)output, data); - return; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_latin1_to_utf8(const char *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_latin1_to_utf8(buf, len, utf8_output); } - uint8_t mask1 = uint8_t(mask); // least significant 8 bits - uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits - uint64x2_t compactmasku64 = {tables::base64::thintable_epi8[mask1], - tables::base64::thintable_epi8[mask2]}; - uint8x16_t compactmask = vreinterpretq_u8_u64(compactmasku64); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t off = - simdutf_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8); -#else - const uint8x16_t off = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; -#endif +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 - compactmask = vaddq_u8(compactmask, off); - uint8x16_t pruned = vqtbl1q_u8(data, compactmask); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output); + } - int pop1 = tables::base64::BitsSetTable256mul2[mask1]; - // then load the corresponding mask, what it does is to write - // only the first pop1 bytes from the first 8 bytes, and then - // it fills in with the bytes from the second 8 bytes + some filling - // at the end. 
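// Scalar reference for the compress() routine above: drop the bytes whose
// mask bit is set and keep the survivors contiguous. The SIMD version gets
// the same result with two table lookups (thintable_epi8 on each 8-byte
// half, then pshufb_combine_table to glue the halves together). Sketch only;
// the helper name is ours:
#include <cstdint>
static inline void compress_scalar(const uint8_t in[16], uint16_t mask,
                                   uint8_t *out) {
  for (int i = 0; i < 16; i++) {
    if ((mask & (uint16_t(1) << i)) == 0) {
      *out++ = in[i]; // keep only the bytes the mask does not flag
    }
  }
}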
- compactmask = vld1q_u8(tables::base64::pshufb_combine_table + pop1 * 8); - uint8x16_t answer = vqtbl1q_u8(pruned, compactmask); - vst1q_u8((uint8_t *)output, answer); -} + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output); + } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 -struct block64 { - uint8x16_t chunks[4]; -}; -static_assert(sizeof(block64) == 64, "block64 is not 64 bytes"); -template uint64_t to_base64_mask(block64 *b, bool *error) { - uint8x16_t v0f = vdupq_n_u8(0xf); +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, + char32_t *latin1_output) const noexcept final override { + return set_best()->convert_latin1_to_utf32(buf, len, latin1_output); + } +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - uint8x16_t underscore0, underscore1, underscore2, underscore3; - if (base64_url) { - underscore0 = vceqq_u8(b->chunks[0], vdupq_n_u8(0x5f)); - underscore1 = vceqq_u8(b->chunks[1], vdupq_n_u8(0x5f)); - underscore2 = vceqq_u8(b->chunks[2], vdupq_n_u8(0x5f)); - underscore3 = vceqq_u8(b->chunks[3], vdupq_n_u8(0x5f)); - } else { - (void)underscore0; - (void)underscore1; - (void)underscore2; - (void)underscore3; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf8_to_latin1(const char *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf8_to_latin1(buf, len, latin1_output); } - uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f); - uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f); - uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f); - uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f); - // Needed by the decoding step. 
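// The classification trick used below, in scalar form: split each byte into
// its low and high nibbles, look both up, and AND the two lookups. A nonzero
// AND flags a byte that cannot be decoded as-is, and (in the real lut_lo /
// lut_hi tables) a value above 0x3 marks an outright error, mirroring the
// `*error = checks > 0x3` test further down. Sketch only; the table contents
// are not reproduced here and the helper name is ours:
#include <cstdint>
static inline uint8_t classify_base64_scalar(uint8_t c,
                                             const uint8_t lut_lo[16],
                                             const uint8_t lut_hi[16]) {
  return uint8_t(lut_lo[c & 0x0F] & lut_hi[c >> 4]);
}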
- uint8x16_t hi_nibbles0 = vshrq_n_u8(b->chunks[0], 4); - uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4); - uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4); - uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4); - uint8x16_t lut_lo; -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - if (base64_url) { - lut_lo = - simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xf4, 0xf5, 0xa5, 0xf4, 0xf4); - } else { - lut_lo = - simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xb4, 0xf5, 0xe5, 0xf4, 0xb4); + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf8_to_latin1_with_errors(buf, len, + latin1_output); } -#else - if (base64_url) { - lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xf4, 0xf5, 0xa5, 0xf4, 0xf4}; - } else { - lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xb4, 0xf5, 0xe5, 0xf4, 0xb4}; + + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output); } -#endif - uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0); - uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1); - uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2); - uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3); - uint8x16_t lut_hi; -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - if (base64_url) { - lut_hi = - simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20); - } else { - lut_hi = - simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20); +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output); } -#else - if (base64_url) { - lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20}; - } else { - lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20}; + + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output); } -#endif - uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0); - uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1); - uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2); - uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3); - if (base64_url) { - hi0 = vbicq_u8(hi0, underscore0); - hi1 = vbicq_u8(hi1, underscore1); - hi2 = vbicq_u8(hi2, underscore2); - hi3 = vbicq_u8(hi3, underscore3); + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, + utf16_output); } - uint8_t checks = - vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)), - vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3)))); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t bit_mask = - 
simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); -#else - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; -#endif - uint64_t badcharmask = 0; - *error = checks > 0x3; - if (checks) { - // Add each of the elements next to each other, successively, to stuff each - // 8 byte mask into one. - uint8x16_t test0 = vtstq_u8(lo0, hi0); - uint8x16_t test1 = vtstq_u8(lo1, hi1); - uint8x16_t test2 = vtstq_u8(lo2, hi2); - uint8x16_t test3 = vtstq_u8(lo3, hi3); - uint8x16_t sum0 = - vpaddq_u8(vandq_u8(test0, bit_mask), vandq_u8(test1, bit_mask)); - uint8x16_t sum1 = - vpaddq_u8(vandq_u8(test2, bit_mask), vandq_u8(test3, bit_mask)); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); - badcharmask = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, + utf16_output); } - // This is the transformation step that can be done while we are waiting for - // sum0 - uint8x16_t roll_lut; -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - if (base64_url) { - roll_lut = - simdutf_make_uint8x16_t(0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); - } else { - roll_lut = - simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output); } -#else - if (base64_url) { - roll_lut = uint8x16_t{0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; - } else { - roll_lut = uint8x16_t{0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output); + } + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept final override { + return set_best()->utf8_length_from_utf16le_with_replacement(input, length); } -#endif - uint8x16_t vsecond_last = base64_url ? 
vdupq_n_u8(0x2d) : vdupq_n_u8(0x2f); - if (base64_url) { - hi_nibbles0 = vbicq_u8(hi_nibbles0, underscore0); - hi_nibbles1 = vbicq_u8(hi_nibbles1, underscore1); - hi_nibbles2 = vbicq_u8(hi_nibbles2, underscore2); - hi_nibbles3 = vbicq_u8(hi_nibbles3, underscore3); - } - uint8x16_t roll0 = vqtbl1q_u8( - roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], vsecond_last), hi_nibbles0)); - uint8x16_t roll1 = vqtbl1q_u8( - roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], vsecond_last), hi_nibbles1)); - uint8x16_t roll2 = vqtbl1q_u8( - roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], vsecond_last), hi_nibbles2)); - uint8x16_t roll3 = vqtbl1q_u8( - roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], vsecond_last), hi_nibbles3)); - b->chunks[0] = vaddq_u8(b->chunks[0], roll0); - b->chunks[1] = vaddq_u8(b->chunks[1], roll1); - b->chunks[2] = vaddq_u8(b->chunks[2], roll2); - b->chunks[3] = vaddq_u8(b->chunks[3], roll3); - return badcharmask; -} -void copy_block(block64 *b, char *output) { - vst1q_u8((uint8_t *)output, b->chunks[0]); - vst1q_u8((uint8_t *)output + 16, b->chunks[1]); - vst1q_u8((uint8_t *)output + 32, b->chunks[2]); - vst1q_u8((uint8_t *)output + 48, b->chunks[3]); -} + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept final override { + return set_best()->utf8_length_from_utf16be_with_replacement(input, length); + } -uint64_t compress_block(block64 *b, uint64_t mask, char *output) { - uint64_t popcounts = - vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0); - uint64_t offsets = popcounts * 0x0101010101010101; - compress(b->chunks[0], uint16_t(mask), output); - compress(b->chunks[1], uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF]); - compress(b->chunks[2], uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF]); - compress(b->chunks[3], uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF]); - return offsets >> 56; -} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -// The caller of this function is responsible to ensure that there are 64 bytes available -// from reading at src. The data is read into a block64 structure. -void load_block(block64 *b, const char *src) { - b->chunks[0] = vld1q_u8(reinterpret_cast(src)); - b->chunks[1] = vld1q_u8(reinterpret_cast(src) + 16); - b->chunks[2] = vld1q_u8(reinterpret_cast(src) + 32); - b->chunks[3] = vld1q_u8(reinterpret_cast(src) + 48); -} +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf8_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf8_to_utf32(buf, len, utf32_output); + } -// The caller of this function is responsible to ensure that there are 32 bytes available -// from reading at data. It returns a 16-byte value, narrowing with saturation the 16-bit words. -inline uint8x16_t load_satured(const uint16_t *data) { - uint16x8_t in1 = vld1q_u16(data); - uint16x8_t in2 = vld1q_u16(data + 8); - return vqmovn_high_u16(vqmovn_u16(in1), in2); -} + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf8_to_utf32_with_errors(buf, len, + utf32_output); + } -// The caller of this function is responsible to ensure that there are 128 bytes available -// from reading at src. The data is read into a block64 structure. 
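// Why the `offsets = popcounts * 0x0101010101010101` multiply in
// compress_block() above works: each byte of `popcounts` counts the kept
// bytes in one 8-bit slice of ~mask, and the multiply turns those eight
// counts into inclusive prefix sums -- byte k of the product is
// popcounts[0] + ... + popcounts[k], and no byte overflows because the grand
// total is at most 64. Scalar sketch (helper name ours):
#include <cstdint>
static inline uint64_t byte_prefix_sums(uint64_t per_byte_counts) {
  return per_byte_counts * 0x0101010101010101ULL;
}
// e.g. counts 0x...0302 give byte 0 = 2 and byte 1 = 2 + 3 = 5; the top byte
// holds the total, which is what compress_block() returns via offsets >> 56.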
-void load_block(block64 *b, const char16_t *src) { - b->chunks[0] = load_satured(reinterpret_cast(src)); - b->chunks[1] = load_satured(reinterpret_cast(src) + 16); - b->chunks[2] = load_satured(reinterpret_cast(src) + 32); - b->chunks[3] = load_satured(reinterpret_cast(src) + 48); -} + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output); + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -// decode 64 bytes and output 48 bytes -void base64_decode_block(char *out, const char *src) { - uint8x16x4_t str = vld4q_u8((uint8_t *)src); - uint8x16x3_t outvec; - outvec.val[0] = - vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4)); - outvec.val[1] = - vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2)); - outvec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]); - vst3q_u8((uint8_t *)out, outvec); -} +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output); + } -template -result compress_decode_base64(char *dst, const char_type *src, size_t srclen, - base64_options options) { - const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value - : tables::base64::to_base64_value; - // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output); } - size_t equalsigns = 0; - if (srclen > 0 && src[srclen - 1] == '=') { - srclen--; - equalsigns = 1; - // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; - } - if (srclen > 0 && src[srclen - 1] == '=') { - srclen--; - equalsigns = 2; - } + + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf16le_to_latin1_with_errors(buf, len, + latin1_output); } - const char_type *const srcinit = src; - const char *const dstinit = dst; - const char_type *const srcend = src + srclen; - constexpr size_t block_size = 10; - char buffer[block_size * 64]; - char *bufferptr = buffer; - if (srclen >= 64) { - const char_type *const srcend64 = src + srclen - 64; - while (src <= srcend64) { - block64 b; - load_block(&b, src); - src += 64; - bool error = false; - uint64_t badcharmask = to_base64_mask(&b, &error); - if(badcharmask) - if (error) { - src -= 64; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf16be_to_latin1_with_errors(buf, len, + latin1_output); + } - while (src < srcend && to_base64[uint8_t(*src)] <= 64) { - src++; - } - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; - } + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output); + } - if (badcharmask != 0) { - // optimization 
opportunity: check for simple masks like those made of - // continuous 1s followed by continuous 0s. And masks containing a - // single bad character. + simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output); + } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 - bufferptr += compress_block(&b, badcharmask, bufferptr); - } else { - // optimization opportunity: if bufferptr == buffer and mask == 0, we - // can avoid the call to compress_block and decode directly. - copy_block(&b, bufferptr); - bufferptr += 64; - // base64_decode_block(dst, &b); - // dst += 48; - } - if (bufferptr >= (block_size - 1) * 64 + buffer) { - for (size_t i = 0; i < (block_size - 1); i++) { - base64_decode_block(dst, buffer + i * 64); - dst += 48; - } - std::memcpy(buffer, buffer + (block_size - 1) * 64, - 64); // 64 might be too much - bufferptr -= (block_size - 1) * 64; - } - } +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + convert_utf16le_to_utf8(const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output); } - char *buffer_start = buffer; - // Optimization note: if this is almost full, then it is worth our - // time, otherwise, we should just decode directly. - int last_block = (int)((bufferptr - buffer_start) % 64); - if (last_block != 0 && srcend - src + last_block >= 64) { - while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { - uint8_t val = to_base64[uint8_t(*src)]; - *bufferptr = char(val); - if (val > 64) { - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; - } - bufferptr += (val <= 63); - src++; - } + + simdutf_warn_unused size_t + convert_utf16be_to_utf8(const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output); } - for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { - base64_decode_block(dst, buffer_start); - dst += 48; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, + utf8_output); } - if ((bufferptr - buffer_start) % 64 != 0) { - while (buffer_start + 4 < bufferptr) { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 4); - dst += 3; - buffer_start += 4; - } - if (buffer_start + 4 <= bufferptr) { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 3); + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, + utf8_output); + } - dst += 3; - buffer_start += 4; - } - // we may have 1, 2 or 3 bytes left and we need to decode them so let us - // 
bring in src content - int leftover = int(bufferptr - buffer_start); - if (leftover > 0) { - while (leftover < 4 && src < srcend) { - uint8_t val = to_base64[uint8_t(*src)]; - if (val > 64) { - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; - } - buffer_start[leftover] = char(val); - leftover += (val <= 63); - src++; - } - - if (leftover == 1) { - return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)}; - } - if (leftover == 2) { - uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) + - (uint32_t(buffer_start[1]) << 2 * 6); - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 1); - dst += 1; - } else if (leftover == 3) { - uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) + - (uint32_t(buffer_start[1]) << 2 * 6) + - (uint32_t(buffer_start[2]) << 1 * 6); - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - - std::memcpy(dst, &triple, 2); - dst += 2; - } else { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 3); - dst += 3; - } - } + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output); } - if (src < srcend + equalsigns) { - result r = - scalar::base64::base64_tail_decode(dst, src, srcend - src, options); - if (r.error == error_code::INVALID_BASE64_CHARACTER) { - r.count += size_t(src - srcinit); - return r; - } else { - r.count += size_t(dst - dstinit); - } - if(r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - r.error = error_code::INVALID_BASE64_CHARACTER; - } - } - return r; + + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output); } - if(equalsigns > 0) { - if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)}; - } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf32_to_latin1(buf, len, latin1_output); } - return {SUCCESS, size_t(dst - dstinit)}; -} -/* end file src/arm64/arm_base64.cpp */ -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* begin file src/generic/buf_block_reader.h */ -namespace simdutf { -namespace arm64 { -namespace { + simdutf_warn_unused result convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf32_to_latin1_with_errors(buf, len, + latin1_output); + } -// Walks through a buffer in block-sized increments, loading the last part with spaces -template -struct buf_block_reader { -public: - simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - simdutf_really_inline size_t block_index(); - simdutf_really_inline bool has_full_block() const; - 
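// ---------------------------------------------------------------------
// [Editor's sketch -- not part of the vendored sources.] The arm_base64
// kernel that ends above decodes sixteen quads per iteration: vld4q_u8
// de-interleaves 64 input bytes into four registers of 6-bit values, the
// shift/or pairs repack them, and vst3q_u8 stores 48 output bytes. A
// scalar model of a single quad (decode_quad is a hypothetical name; the
// inputs are assumed to be already mapped through to_base64, i.e. each
// in 0..63):
#include <cstdint>
static inline void decode_quad(const uint8_t in[4], uint8_t out[3]) {
  out[0] = uint8_t((in[0] << 2) | (in[1] >> 4)); // 6 bits + top 2 of next
  out[1] = uint8_t((in[1] << 4) | (in[2] >> 2)); // low 4 bits + top 4
  out[2] = uint8_t((in[2] << 6) | in[3]);        // low 2 bits + last 6
}
// The same packing explains the buffered tail path above: four sextets
// are assembled into a 32-bit "triple", byte-swapped, and three bytes
// are memcpy'd out.
// ---------------------------------------------------------------------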
simdutf_really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this - * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there - * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. - */ - simdutf_really_inline size_t get_remainder(uint8_t *dst) const; - simdutf_really_inline void advance(); -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; + simdutf_warn_unused size_t convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf32_to_latin1(buf, len, latin1_output); + } +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text_64(const uint8_t *text) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - for (size_t i=0; i); i++) { - buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + convert_utf32_to_utf8(const char32_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf32_to_utf8(buf, len, utf8_output); } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text(const simd8x64& in) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - in.store(reinterpret_cast(buf)); - for (size_t i=0; i); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output); } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} -simdutf_unused static char * format_mask(uint64_t mask) { - static char *buf = reinterpret_cast(malloc(64 + 1)); - for (size_t i=0; i<64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf8(const char32_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output); } - buf[64] = '\0'; - return buf; -} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -template -simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, idx{0} {} +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf16le( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output); + } -template -simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + simdutf_warn_unused size_t convert_utf32_to_utf16be( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output); + } -template -simdutf_really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; -} + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, + utf16_output); + } -template -simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { - return &buf[idx]; -} + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, + utf16_output); + } -template -simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { - if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers - std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. - std::memcpy(dst, buf + idx, len - idx); - return len - idx; -} + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output); + } -template -simdutf_really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; -} + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output); + } -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/buf_block_reader.h */ -/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_validation { + simdutf_warn_unused size_t convert_utf16le_to_utf32( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output); + } -using namespace simd; + simdutf_warn_unused size_t convert_utf16be_to_utf32( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output); + } - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 
100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, + utf32_output); + } - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, + utf32_output); + } - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output); } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = 
simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; + + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output); } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 - // - // Return nonzero if there are incomplete multibyte characters at the end of the block: - // e.g. if there is a 4-byte character, but it's 3 bytes from the end. - // - simdutf_really_inline simd8 is_incomplete(const simd8 input) { - // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): - // ... 1111____ 111_____ 11______ - static const uint8_t max_array[32] = { - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 - }; - const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); - return input.gt_bits(max_value); +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *buf, size_t len, + char16_t *output) const noexcept final override { + set_best()->change_endianness_utf16(buf, len, output); } - struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; - // The last input we received - simd8 prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast path) - simd8 prev_incomplete; + simdutf_warn_unused size_t + count_utf16le(const char16_t *buf, size_t len) const noexcept final override { + return set_best()->count_utf16le(buf, len); + } - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - // The only problem that can happen at EOF is that a multibyte character is too short - // or a byte value too large in the last bytes: check_special_cases only checks for bytes - // too large in the first of two bytes. - simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error |= this->prev_incomplete; - } + simdutf_warn_unused size_t + count_utf16be(const char16_t *buf, size_t len) const noexcept final override { + return set_best()->count_utf16be(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF16 - simdutf_really_inline void check_next_input(const simd8x64& input) { - if(simdutf_likely(is_ascii(input))) { - this->error |= this->prev_incomplete; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
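// ---------------------------------------------------------------------
// [Editor's sketch -- not part of the vendored sources.] A scalar model
// of the check_special_cases tables above: the window (prev, curr) is
// split into three nibbles (prev>>4, prev&15, curr>>4) and the AND of
// the three lookups flags anything that looks wrong in isolation.
// classify_pair is a hypothetical name; in the real kernel,
// check_multibyte_lengths then cancels the expected TWO_CONTS hits that
// legitimately follow 3- and 4-byte lead bytes.
#include <cstdint>
static inline uint8_t classify_pair(uint8_t prev, uint8_t curr) {
  constexpr uint8_t TOO_SHORT = 1 << 0, TOO_LONG = 1 << 1,
                    OVERLONG_3 = 1 << 2, TOO_LARGE = 1 << 3,
                    SURROGATE = 1 << 4, OVERLONG_2 = 1 << 5,
                    TOO_LARGE_1000 = 1 << 6, OVERLONG_4 = 1 << 6,
                    TWO_CONTS = 1 << 7;
  constexpr uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  static const uint8_t byte_1_high[16] = {
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,     // 0_______
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, // 10______
      TOO_SHORT | OVERLONG_2,                     // 1100____
      TOO_SHORT,                                  // 1101____
      TOO_SHORT | OVERLONG_3 | SURROGATE,         // 1110____
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4};
  static const uint8_t byte_1_low[16] = {
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      CARRY | OVERLONG_2,
      CARRY, CARRY,
      CARRY | TOO_LARGE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, // ____1101
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000};
  static const uint8_t byte_2_high[16] = {
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, // ________ 0_______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,                             // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT}; // ________ 11______
  return uint8_t(byte_1_high[prev >> 4] & byte_1_low[prev & 15] &
                 byte_2_high[curr >> 4]);
}
// e.g. classify_pair(0xED, 0xA0) == SURROGATE (a UTF-8-encoded lone
// surrogate), while classify_pair(0xE2, 0x82) == 0 (valid prefix of
// U+20AC).
// ---------------------------------------------------------------------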
- static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); - this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t + count_utf8(const char *buf, size_t len) const noexcept final override { + return set_best()->count_utf8(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF8 - } - } +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + latin1_length_from_utf8(const char *buf, size_t len) const noexcept override { + return set_best()->latin1_length_from_utf8(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 - // do not forget to call check_eof! - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + utf8_length_from_latin1(const char *buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_latin1(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 - }; // struct utf8_checker -} // namespace utf8_validation +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t utf8_length_from_utf16le( + const char16_t *buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf16le(buf, len); + } -using utf8_validation::utf8_checker; + simdutf_warn_unused size_t utf8_length_from_utf16be( + const char16_t *buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf16be(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -/* begin file src/generic/utf8_validation/utf8_validator.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_validation { +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *buf, size_t len) const noexcept override { + return set_best()->utf32_length_from_utf16le(buf, len); + } -/** - * Validates that the string is actual UTF-8. 
- */ -template -bool generic_validate_utf8(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - return !c.errors(); -} + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *buf, size_t len) const noexcept override { + return set_best()->utf32_length_from_utf16be(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -bool generic_validate_utf8(const char * input, size_t length) { - return generic_validate_utf8(reinterpret_cast(input),length); -} +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + utf16_length_from_utf8(const char *buf, size_t len) const noexcept override { + return set_best()->utf16_length_from_utf8(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -/** - * Validates that the string is actual UTF-8 and stops on errors. - */ -template -result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - if(c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input + count), length - count); - res.count += count; - return res; - } - reader.advance(); - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - if (c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input) + count, length - count); - res.count += count; - return res; - } else { - return result(error_code::SUCCESS, length); - } -} +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf8_length_from_utf32( + const char32_t *buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf32(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -result generic_validate_utf8_with_errors(const char * input, size_t length) { - return generic_validate_utf8_with_errors(reinterpret_cast(input),length); -} +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t utf16_length_from_utf32( + const char32_t *buf, size_t len) const noexcept override { + return set_best()->utf16_length_from_utf32(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -template -bool generic_validate_ascii(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - uint8_t blocks[64]{}; - simd::simd8x64 running_or(blocks); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - running_or |= in; - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - running_or |= in; - return running_or.is_ascii(); -} +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf32_length_from_utf8(const char *buf, size_t 
len) const noexcept override { + return set_best()->utf32_length_from_utf8(buf, len); + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -bool generic_validate_ascii(const char * input, size_t length) { - return generic_validate_ascii(reinterpret_cast(input),length); -} +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_handling_options = + last_chunk_handling_options::loose) const noexcept override { + return set_best()->base64_to_binary(input, length, output, options, + last_chunk_handling_options); + } -template -result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); - } - reader.advance(); + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_handling_options = + last_chunk_handling_options::loose) const noexcept override { + return set_best()->base64_to_binary_details(input, length, output, options, + last_chunk_handling_options); + } - count += 64; + simdutf_warn_unused result base64_to_binary( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_handling_options = + last_chunk_handling_options::loose) const noexcept override { + return set_best()->base64_to_binary(input, length, output, options, + last_chunk_handling_options); } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); - } else { - return result(error_code::SUCCESS, length); + + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_handling_options = + last_chunk_handling_options::loose) const noexcept override { + return set_best()->base64_to_binary_details(input, length, output, options, + last_chunk_handling_options); } -} -result generic_validate_ascii_with_errors(const char * input, size_t length) { - return generic_validate_ascii_with_errors(reinterpret_cast(input),length); -} + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept override { + return set_best()->binary_to_base64(input, length, output, options); + } -} // namespace utf8_validation -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_validator.h */ -// transcoding from UTF-8 to UTF-16 -/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + size_t + binary_to_base64_with_lines(const char *input, size_t length, char *output, + size_t line_length, + base64_options options) const noexcept override { + return set_best()->binary_to_base64_with_lines(input, length, output, + line_length, options); + } + const char *find(const char *start, const char *end, + char character) const noexcept override { + return set_best()->find(start, end, 
character); + } -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf16 { + const char16_t *find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept override { + return set_best()->find(start, end, character); + } +#endif // SIMDUTF_FEATURE_BASE64 -using namespace simd; + simdutf_really_inline + detect_best_supported_implementation_on_first_use() noexcept + : implementation("best_supported_detector", + "Detects the best supported implementation and sets it", + 0) {} -template -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char16_t* utf16_output) noexcept { - // The implementation is not specific to haswell and should be moved to the generic directory. - size_t pos = 0; - char16_t* start{utf16_output}; - const size_t safety_margin = 16; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - // this loop could be unrolled further. For example, we could process the mask - // far more than 64 bytes. - simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { - in.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // Slow path. We hope that the compiler will recognize that this is a slow path. - // Anything that is not a continuation mask is a 'leading byte', that is, the - // start of a new code point. - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - // The *start* of code points is not so useful, rather, we want the *end* of code points. - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times when using solely - // the slow/regular path, and at least four times if there are fast paths. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - // - // Thus we may allow convert_masked_utf8_to_utf16 to process - // more bytes at a time under a fast-path mode where 16 bytes - // are consumed at once (e.g., when encountering ASCII). - size_t consumed = convert_masked_utf8_to_utf16(input + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. 
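// ---------------------------------------------------------------------
// [Editor's sketch -- not part of the vendored sources.] A scalar model
// of the masks driving the loop above. A byte b is a UTF-8 continuation
// iff int8_t(b) < -64 (i.e. b in 0x80..0xBF); every other byte (ASCII or
// a lead byte) starts a code point, so shifting the leading-byte mask
// right by one marks the byte that *ends* each code point instead.
// end_of_code_point_mask is a hypothetical name:
#include <cstdint>
static inline uint64_t end_of_code_point_mask(const uint8_t block[64]) {
  uint64_t continuation = 0;
  for (int i = 0; i < 64; i++) {
    if (int8_t(block[i]) < -64) { continuation |= uint64_t(1) << i; }
  }
  const uint64_t leading = ~continuation; // ASCII and lead bytes
  return leading >> 1;                    // bit i: byte i ends a code point
}
// convert_masked_utf8_to_utf16 consumes this mask up to 12 bytes at a
// time (16 on fast paths), which is why the inner loop stops 12 bytes
// short of the end of the 64-byte block.
// ---------------------------------------------------------------------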
- } - } - utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); - return utf16_output - start; -} +private: + const implementation *set_best() const noexcept; +}; -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +static_assert(std::is_trivially_destructible< + detect_best_supported_implementation_on_first_use>::value, + "detect_best_supported_implementation_on_first_use should be " + "trivially destructible"); +static const std::initializer_list & +get_available_implementation_pointers() { + static const std::initializer_list + available_implementation_pointers{ +#if SIMDUTF_IMPLEMENTATION_ICELAKE + get_icelake_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_HASWELL + get_haswell_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE + get_westmere_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_ARM64 + get_arm64_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 + get_ppc64_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_RVV + get_rvv_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_LASX + get_lasx_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_LSX + get_lsx_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK + get_fallback_singleton(), +#endif + }; // available_implementation_pointers + return available_implementation_pointers; +} -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf16 { -using namespace simd; +// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no +// support +class unsupported_implementation final : public implementation { +public: +#if SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused int detect_encodings(const char *, + size_t) const noexcept override { + return encoding_type::unspecified; + } +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool validate_utf8(const char *, + size_t) const noexcept final override { + return false; // Just refuse to validate. Given that we have a fallback + // implementation + // it seems unlikely that unsupported_implementation will ever be used. If + // it is used, then it will flag all strings as invalid. The alternative is + // to return an error_code from which the user has to figure out whether the + // string is valid UTF-8... which seems like a lot of work just to handle + // the very unlikely case that we have an unsupported implementation. And, + // when it does happen (that we have an unsupported implementation), what + // are the chances that the programmer has a fallback? Given that *we* + // provide the fallback, it implies that the programmer would need a + // fallback for our fallback. 
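// ---------------------------------------------------------------------
// [Editor's sketch -- not part of the vendored sources.] How the
// dispatch machinery above is typically consumed, assuming the public
// single-header API (get_available_implementations,
// get_active_implementation and supported_by_runtime_system are the
// upstream names as we understand them):
#include "simdutf.h"
#include <cstdio>
int main() {
  for (const simdutf::implementation *impl :
       simdutf::get_available_implementations()) {
    std::printf("%s: %s%s\n", impl->name().c_str(),
                impl->description().c_str(),
                impl->supported_by_runtime_system() ? ""
                                                    : " (not usable here)");
  }
  // The first call through the default implementation runs set_best()
  // and caches the winner; unsupported_implementation is only handed out
  // when no compiled-in kernel can run on the current CPU.
  std::printf("active: %s\n",
              simdutf::get_active_implementation()->name().c_str());
  return 0;
}
// ---------------------------------------------------------------------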
+ } +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused result validate_utf8_with_errors( + const char *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_ascii(const char *, size_t) const noexcept final override { + return false; + } - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
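// [Editor's note] The byte_1_high/byte_1_low/byte_2_high tables being
// rebuilt here are a verbatim repeat of the ones in
// utf8_lookup4_algorithm.h above; the amalgamation appears to
// re-instantiate the same lookup4 classification inside each generic
// namespace (utf8_validation, utf8_to_utf16, utf8_to_utf32) so that each
// header remains self-contained.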
- const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + simdutf_warn_unused result validate_ascii_with_errors( + const char *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } +#endif // SIMDUTF_FEATURE_ASCII - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + simdutf_warn_unused bool + validate_utf16le_as_ascii(const char16_t *, + size_t) const noexcept final override { + return false; } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; + + simdutf_warn_unused bool + validate_utf16be_as_ascii(const char16_t *, + size_t) const noexcept final override { + return false; } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool + validate_utf16le(const char16_t *, size_t) const noexcept final override { + return false; + } +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; +#if SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused bool + validate_utf16be(const char16_t *, size_t) const noexcept final override { + return false; + } - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. 
- // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - - template - simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { - size_t pos = 0; - char16_t* start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. 
These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); - if(howmany == 0) { return 0; } - utf16_output += howmany; - } - return utf16_output - start; - } - - template - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { - size_t pos = 0; - char16_t* start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. 
Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - if(pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf16_output += res.count; - } - } - return result(error_code::SUCCESS, utf16_output - start); - } + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + void to_well_formed_utf16be(const char16_t *, size_t, + char16_t *) const noexcept final override {} + void to_well_formed_utf16le(const char16_t *, size_t, + char16_t *) const noexcept final override {} +#endif // SIMDUTF_FEATURE_UTF16 - }; // struct utf8_checker -} // utf8_to_utf16 namespace -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -// transcoding from UTF-8 to UTF-32 -/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + simdutf_warn_unused bool + validate_utf32(const char32_t *, size_t) const noexcept final override { + return false; + } +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf32 { +#if SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } +#endif // SIMDUTF_FEATURE_UTF32 -using namespace simd; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *, size_t, char *) const noexcept final override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *, size_t, char16_t *) const noexcept final override { + return 0; + } -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char32_t* utf32_output) noexcept { - size_t pos = 0; - char32_t* start{utf32_output}; 
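// ---------------------------------------------------------------------
// [Editor's note -- not part of the vendored sources.] convert_valid may
// assume well-formed input, so the fixed 16-byte margin just below is
// enough. The validating transcoders above instead derive the margin
// from the data: scan backwards until eight lead bytes have been seen.
// A standalone model (dynamic_safety_margin is a hypothetical name):
#include <cstddef>
#include <cstdint>
static inline size_t dynamic_safety_margin(const char *in, size_t size) {
  size_t leading_byte = 0, margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    // a lead (or ASCII) byte is anything that is not 0b10xxxxxx
    leading_byte += (int8_t(in[margin - 1]) > -65) ? 1 : 0;
  }
  // At most eight code points start in [margin, size), so stopping the
  // SIMD loop this many bytes early bounds how far a masked conversion
  // can write beyond what has been validated.
  return size - margin + 1;
}
// ---------------------------------------------------------------------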
- const size_t safety_margin = 16; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { - in.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - size_t max_starting_point = (pos + 64) - 12; - while(pos < max_starting_point) { - size_t consumed = convert_masked_utf8_to_utf32(input + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - } + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *, size_t, char16_t *) const noexcept final override { + return 0; } - utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); - return utf32_output - start; -} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *, size_t, char32_t *) const noexcept final override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *, size_t, char *) const noexcept final override { + return 0; + } + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf32 { -using namespace simd; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *, size_t, char *) const noexcept final override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *, size_t, char16_t *) const noexcept final override { + return 0; + } - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 
1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *, size_t, char16_t *) const noexcept final override { + return 0; + } - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *, size_t, char16_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *, size_t, char16_t *) const noexcept final override { + return result(error_code::OTHER, 0); } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *, size_t, char16_t *) const noexcept final override { + return 0; } + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *, size_t, char16_t *) const noexcept final override { + return 0; + } + simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *, size_t) const noexcept final override { + return {OTHER, 0}; // Not supported + } - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. 
- simd8 error; + simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *, size_t) const noexcept final override { + return {OTHER, 0}; // Not supported + } - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - - - simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { - size_t pos = 0; - char32_t* start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. 
Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); - if(howmany == 0) { return 0; } - utf32_output += howmany; - } - return utf32_output - start; - } - - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { - size_t pos = 0; - char32_t* start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. 
- // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - if(pos < size) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf32_output += res.count; - } - } - return result(error_code::SUCCESS, utf32_output - start); - } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *, size_t, char32_t *) const noexcept final override { + return 0; + } - }; // struct utf8_checker -} // utf8_to_utf32 namespace -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -// other functions -/* begin file src/generic/utf8.h */ + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *, size_t, char32_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8 { + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *, size_t, char32_t *) const noexcept final override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -using namespace simd; +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf16le_to_latin1( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } -simdutf_really_inline size_t count_code_points(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.gt(-65); - count += count_ones(utf8_continuation_mask); - } - return count + scalar::utf8::count_code_points(in + pos, size - pos); -} + simdutf_warn_unused size_t convert_utf16be_to_latin1( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } -simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - // We count one word for anything that is not a continuation (so - // leading bytes). 
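// [Editorial sketch; not part of the vendored simdutf sources.] The SIMD
// count below is a vectorized form of the following scalar rule: each byte
// that is not a continuation byte (10xxxxxx) starts a code point and costs
// one UTF-16 code unit, and each 4-byte lead (>= 0xF0) costs one extra unit
// for the low surrogate.
#include <cstddef>
static size_t utf16_length_from_utf8_scalar(const unsigned char *s, size_t n) {
  size_t count = 0;
  for (size_t i = 0; i < n; i++) {
    count += (s[i] & 0xC0) != 0x80; // leading byte or ASCII: one code unit
    count += s[i] >= 0xF0;          // 4-byte lead: plus one surrogate
  }
  return count;
}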
- count += 64 - count_ones(utf8_continuation_mask); - int64_t utf8_4byte = input.gteq_unsigned(240); - count += count_ones(utf8_4byte); - } - return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); -} -} // utf8 namespace -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8.h */ -/* begin file src/generic/utf16.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf16 { + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } -template -simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos < size/32*32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { input.swap_bytes(); } - uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); - count += count_ones(not_pair) / 2; - } - return count + scalar::utf16::count_code_points(in + pos, size - pos); -} + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } -template -simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for(;pos < size/32*32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { input.swap_bytes(); } - uint64_t ascii_mask = input.lteq(0x7F); - uint64_t twobyte_mask = input.lteq(0x7FF); - uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } - size_t ascii_count = count_ones(ascii_mask) / 2; - size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; - size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; - size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; - count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; - } - return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); -} + simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 -template -simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { - return count_code_points(in, size); -} +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } -simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { - size_t pos = 0; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } - while (pos < size/32*32) { - simd16x32 input(reinterpret_cast(in + pos)); - input.swap_bytes(); - input.store(reinterpret_cast(output)); - pos += 32; - output += 32; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); } - scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); -} + simdutf_warn_unused 
result convert_utf16be_to_utf8_with_errors( + const char16_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } -} // utf16 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf16.h */ -// transcoding from UTF-8 to Latin 1 -/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_latin1 { -using namespace simd; +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t convert_utf32_to_latin1( + const char32_t *, size_t, char *) const noexcept final override { + return 0; + } + simdutf_warn_unused result convert_utf32_to_latin1_with_errors( + const char32_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// For UTF-8 to Latin 1, we can allow any ASCII character, and any continuation byte, -// but the non-ASCII leading bytes must be 0b11000011 or 0b11000010 and nothing else. -// -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - constexpr const uint8_t FORBIDDEN = 0xff; - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - FORBIDDEN, - // 1110____ ________ - FORBIDDEN, - // 1111____ ________ - FORBIDDEN - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
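// [Editorial note; not part of the vendored simdutf sources.] For UTF-8 to
// Latin-1, the only legal non-ASCII sequences are two-byte ones whose lead
// byte is 0xC2 or 0xC3 (U+0080..U+00FF), so every other lead-byte pattern in
// these tables maps to FORBIDDEN (0xff), which sets every error bit at once.
// For example, "é" (U+00E9, bytes C3 A9) is accepted, while "€" (U+20AC,
// bytes E2 82 AC) is rejected because the lead byte E2 selects FORBIDDEN.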
- const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - FORBIDDEN, - // ____0101 ________ - FORBIDDEN, - // ____011_ ________ - FORBIDDEN, - FORBIDDEN, + simdutf_warn_unused size_t convert_valid_utf32_to_latin1( + const char32_t *, size_t, char *) const noexcept final override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - // ____1___ ________ - FORBIDDEN, - FORBIDDEN, - FORBIDDEN, - FORBIDDEN, - FORBIDDEN, - // ____1101 ________ - FORBIDDEN, - FORBIDDEN, - FORBIDDEN - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *, size_t, char *) const noexcept final override { + return 0; } - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; - - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - this->error |= check_special_cases(input, prev1); - } - - - simdutf_really_inline size_t convert(const char* in, size_t size, char* latin1_output) { - size_t pos = 0; - char* start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store((int8_t*)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
- static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1(in + pos, - utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); - if(howmany == 0) { return 0; } - latin1_output += howmany; - } - return latin1_output - start; - } - - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char* latin1_output) { - size_t pos = 0; - char* start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! 
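// [Editorial note; not part of the vendored simdutf sources.] The predicate
// in the margin loop above deserves a gloss: read as a signed byte, every
// continuation byte 0x80..0xBF is -128..-65, so the comparison is true
// exactly for ASCII bytes and lead bytes. Standalone equivalent:
#include <cstdint>
static bool starts_code_point(char b) {
  return int8_t(b) > -65; // same as (uint8_t(b) & 0xC0) != 0x80
}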
- while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store((int8_t*)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1(in + pos, - utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); - res.count += pos; - return res; - } - if(pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. 
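// [Editorial note; not part of the vendored simdutf sources.] A worked
// example of the position arithmetic around rewind_and_convert_with_errors:
// the positions it reports are relative to `in + pos`, so the caller adds
// `pos` back. If the SIMD loop stopped at pos = 128 and the scalar rescan
// finds the offending byte 5 bytes further on, the returned error position
// is 133, an absolute offset into the original buffer.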
- result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - latin1_output += res.count; - } - } - return result(error_code::SUCCESS, latin1_output - start); - } + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *, size_t, char *) const noexcept final override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 - }; // struct utf8_checker -} // utf8_to_latin1 namespace -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ -/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t convert_utf32_to_utf16le( + const char32_t *, size_t, char16_t *) const noexcept final override { + return 0; + } + simdutf_warn_unused size_t convert_utf32_to_utf16be( + const char32_t *, size_t, char16_t *) const noexcept final override { + return 0; + } -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_latin1 { -using namespace simd; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *, size_t, char16_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *, size_t, char16_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } - simdutf_really_inline size_t convert_valid(const char* in, size_t size, char* latin1_output) { - size_t pos = 0; - char* start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store((int8_t*)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. 
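// [Editorial sketch; not part of the vendored simdutf sources.] A worked
// example of the mask algebra in the next two lines, for a toy 6-byte block
// "A" "é" "€" (bytes 41 C3 A9 E2 82 AC, bit i describing byte i):
#include <cassert>
#include <cstdint>
int main() {
  uint64_t continuation = 0b110100;            // bytes 2, 4, 5 are 10xxxxxx
  uint64_t leading = ~continuation & 0b111111; // bytes 0, 1, 3 start code points
  uint64_t end_of_cp = leading >> 1; // byte i ends a code point exactly when
                                     // byte i+1 starts one
  assert(end_of_cp == 0b000101); // byte 0 ends "A", byte 2 ends "é"; the end
                                 // of "€" is flagged by the next block's bits
  return 0;
}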
- uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1(in + pos, - utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(pos < size) { - size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, latin1_output); - latin1_output += howmany; - } - return latin1_output - start; - } + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( + const char32_t *, size_t, char16_t *) const noexcept final override { + return 0; + } + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( + const char32_t *, size_t, char16_t *) const noexcept final override { + return 0; } -} // utf8_to_latin1 namespace -} // unnamed namespace -} // namespace arm64 - // namespace simdutf -/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ -// placeholder scalars + simdutf_warn_unused size_t convert_utf16le_to_utf32( + const char16_t *, size_t, char32_t *) const noexcept final override { + return 0; + } -// -// Implementation-specific overrides -// -namespace simdutf { -namespace arm64 { + simdutf_warn_unused size_t convert_utf16be_to_utf32( + const char16_t *, size_t, char32_t *) const noexcept final override { + return 0; + } -simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { - // If there is a BOM, then we trust it. 
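// [Editorial sketch; not part of the vendored simdutf sources.] The detector
// below trusts any BOM first, then falls back to content analysis; a buffer
// with an odd byte count cannot be UTF-16 or UTF-32, whose code units are 2
// and 4 bytes wide, so only UTF-8 remains plausible. A hypothetical caller,
// assuming simdutf's public detect_encodings entry point, which returns a
// bitmask of plausible encodings:
#include <cstddef>
#include <cstdio>
#include "simdutf.h"
void report(const char *data, size_t len) {
  int enc = simdutf::detect_encodings(data, len);
  if (enc & simdutf::encoding_type::UTF8)     { std::puts("could be UTF-8"); }
  if (enc & simdutf::encoding_type::UTF16_LE) { std::puts("could be UTF-16LE"); }
}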
- auto bom_encoding = simdutf::BOM::check_bom(input, length); - if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - if (length % 2 == 0) { - return arm_detect_encodings(input, length); - } else { - if (implementation::validate_utf8(input, length)) { - return simdutf::encoding_type::UTF8; - } else { - return simdutf::encoding_type::unspecified; - } + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *, size_t, char32_t *) const noexcept final override { + return result(error_code::OTHER, 0); } -} -simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_utf8(buf,len); -} + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *, size_t, char32_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } -simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_utf8_with_errors(buf,len); -} + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32( + const char16_t *, size_t, char32_t *) const noexcept final override { + return 0; + } -simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_ascii(buf,len); -} + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( + const char16_t *, size_t, char32_t *) const noexcept final override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_ascii_with_errors(buf,len); -} +#if SIMDUTF_FEATURE_UTF16 + void change_endianness_utf16(const char16_t *, size_t, + char16_t *) const noexcept final override {} -simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = arm_validate_utf16(buf, len); - if (tail) { - return scalar::utf16::validate(tail, len - (tail - buf)); - } else { - return false; + simdutf_warn_unused size_t + count_utf16le(const char16_t *, size_t) const noexcept final override { + return 0; } -} -simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = arm_validate_utf16(buf, len); - if (tail) { - return scalar::utf16::validate(tail, len - (tail - buf)); - } else { - return false; + simdutf_warn_unused size_t + count_utf16be(const char16_t *, size_t) const noexcept final override { + return 0; } -} +#endif // SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { - result res = arm_validate_utf16_with_errors(buf, len); - if (res.count != len) { - result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); - } else { - return res; +#if SIMDUTF_FEATURE_UTF8 + simdutf_warn_unused size_t count_utf8(const char *, + size_t) const noexcept final override { + return 0; } -} +#endif // SIMDUTF_FEATURE_UTF8 -simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { - result res = arm_validate_utf16_with_errors(buf, len); - if (res.count != len) { - result scalar_res = 
scalar::utf16::validate_with_errors(buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); - } else { - return res; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + latin1_length_from_utf8(const char *, size_t) const noexcept override { + return 0; } -} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - const char32_t* tail = arm_validate_utf32le(buf, len); - if (tail) { - return scalar::utf32::validate(tail, len - (tail - buf)); - } else { - return false; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + simdutf_warn_unused size_t + utf8_length_from_latin1(const char *, size_t) const noexcept override { + return 0; } -} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { - result res = arm_validate_utf32le_with_errors(buf, len); - if (res.count != len) { - result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); - } else { - return res; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override { + return 0; } -} -simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = arm_convert_latin1_to_utf8(buf, len, utf8_output); - size_t converted_chars = ret.second - utf8_output; + simdutf_warn_unused size_t + utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - converted_chars += scalar_converted_chars; +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override { + return 0; } - return converted_chars; -} -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = arm_convert_latin1_to_utf16(buf, len, utf16_output); - size_t converted_chars = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - converted_chars += scalar_converted_chars; + simdutf_warn_unused size_t + utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override { + return 0; } - return converted_chars; -} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = arm_convert_latin1_to_utf16(buf, len, utf16_output); - size_t converted_chars = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - converted_chars += scalar_converted_chars; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + simdutf_warn_unused size_t + utf16_length_from_utf8(const char *, 
size_t) const noexcept override { + return 0; } - return converted_chars; -} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = arm_convert_latin1_to_utf32(buf, len, utf32_output); - size_t converted_chars = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - converted_chars += scalar_converted_chars; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf8_length_from_utf32(const char32_t *, size_t) const noexcept override { + return 0; } - return converted_chars; -} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { - utf8_to_latin1::validating_transcoder converter; - return converter.convert(buf, len, latin1_output); -} +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf16_length_from_utf32(const char32_t *, size_t) const noexcept override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept { - utf8_to_latin1::validating_transcoder converter; - return converter.convert_with_errors(buf, len, latin1_output); -} +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + simdutf_warn_unused size_t + utf32_length_from_utf8(const char *, size_t) const noexcept override { + return 0; + } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { - return arm64::utf8_to_latin1::convert_valid(buf,len,latin1_output); -} +#if SIMDUTF_FEATURE_BASE64 + simdutf_warn_unused result + base64_to_binary(const char *, size_t, char *, base64_options, + last_chunk_handling_options) const noexcept override { + return result(error_code::OTHER, 0); + } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert(buf, len, utf16_output); -} + simdutf_warn_unused full_result base64_to_binary_details( + const char *, size_t, char *, base64_options, + last_chunk_handling_options) const noexcept override { + return full_result(error_code::OTHER, 0, 0); + } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert(buf, len, utf16_output); -} + simdutf_warn_unused result + base64_to_binary(const char16_t *, size_t, char *, base64_options, + last_chunk_handling_options) const noexcept override { + return result(error_code::OTHER, 0); + } -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf16_output); -} + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *, size_t, char *, base64_options, + 
last_chunk_handling_options) const noexcept override { + return full_result(error_code::OTHER, 0, 0); + } -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf16_output); -} + size_t binary_to_base64(const char *, size_t, char *, + base64_options) const noexcept override { + return 0; + } + size_t binary_to_base64_with_lines(const char *, size_t, char *, size_t, + base64_options) const noexcept override { + return 0; + } + const char *find(const char *, const char *, char) const noexcept override { + return nullptr; + } + const char16_t *find(const char16_t *, const char16_t *, + char16_t) const noexcept override { + return nullptr; + } +#endif // SIMDUTF_FEATURE_BASE64 -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size, - char16_t* utf16_output) const noexcept { - return utf8_to_utf16::convert_valid(input, size, utf16_output); -} + unsupported_implementation() + : implementation("unsupported", + "Unsupported CPU (no detected SIMD instructions)", 0) {} +}; -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size, - char16_t* utf16_output) const noexcept { - return utf8_to_utf16::convert_valid(input, size, utf16_output); +const unsupported_implementation *get_unsupported_singleton() { + static const unsupported_implementation unsupported_singleton{}; + return &unsupported_singleton; } +static_assert(std::is_trivially_destructible::value, + "unsupported_singleton should be trivially destructible"); -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - utf8_to_utf32::validating_transcoder converter; - return converter.convert(buf, len, utf32_output); +size_t available_implementation_list::size() const noexcept { + return internal::get_available_implementation_pointers().size(); } - -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - utf8_to_utf32::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf32_output); +const implementation *const * +available_implementation_list::begin() const noexcept { + return internal::get_available_implementation_pointers().begin(); } - -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, - char32_t* utf32_output) const noexcept { - return utf8_to_utf32::convert_valid(input, size, utf32_output); +const implementation *const * +available_implementation_list::end() const noexcept { + return internal::get_available_implementation_pointers().end(); } - -simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = arm_convert_utf16_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - latin1_output; - - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; +const implementation * +available_implementation_list::detect_best_supported() const noexcept { + // They are prelisted in 
priority order, so we just go down the list + uint32_t supported_instruction_sets = + internal::detect_supported_architectures(); + for (const implementation *impl : + internal::get_available_implementation_pointers()) { + uint32_t required_instruction_sets = impl->required_instruction_sets(); + if ((supported_instruction_sets & required_instruction_sets) == + required_instruction_sets) { + return impl; + } } - return saved_bytes; + return get_unsupported_singleton(); // this should never happen? } -simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = arm_convert_utf16_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - latin1_output; - - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} +const implementation * +detect_best_supported_implementation_on_first_use::set_best() const noexcept { + SIMDUTF_PUSH_DISABLE_WARNINGS + SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: + // manually verified this is safe + char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION"); + SIMDUTF_POP_DISABLE_WARNINGS -simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = arm_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; + if (force_implementation_name) { + auto force_implementation = + get_available_implementations()[force_implementation_name]; + if (force_implementation) { + return get_active_implementation() = force_implementation; } else { - ret.second += scalar_res.count; + // Note: abort() and stderr usage within the library is forbidden. 
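// [Editorial sketch; not part of the vendored simdutf sources.] The selection
// logic above can also be driven from user code: enumerate the compiled-in
// kernels and install one explicitly, mirroring what the
// SIMDUTF_FORCE_IMPLEMENTATION environment variable does here. The kernel
// name "fallback" is illustrative; available names depend on the build.
#include <cstdio>
#include "simdutf.h"
void force_fallback_kernel() {
  for (const simdutf::implementation *impl :
       simdutf::get_available_implementations()) {
    std::printf("%s: %s\n", impl->name().c_str(), impl->description().c_str());
  }
  const simdutf::implementation *fb =
      simdutf::get_available_implementations()["fallback"];
  if (fb) { simdutf::get_active_implementation() = fb; }
}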
+ return get_active_implementation() = get_unsupported_singleton(); } } - ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; + return get_active_implementation() = + get_available_implementations().detect_best_supported(); } -simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = arm_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; +} // namespace internal + +/** + * The list of available implementations compiled into simdutf. + */ +SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list & +get_available_implementations() { + static const internal::available_implementation_list + available_implementations{}; + return available_implementations; } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - // optimization opportunity: implement a custom function. - return convert_utf16be_to_latin1(buf, len, latin1_output); +/** + * The active implementation. + */ +SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr & +get_active_implementation() { +#if SIMDUTF_SINGLE_IMPLEMENTATION + // skip runtime detection + static internal::atomic_ptr active_implementation{ + internal::get_single_implementation()}; + return active_implementation; +#else + static const internal::detect_best_supported_implementation_on_first_use + detect_best_supported_implementation_on_first_use_singleton; + static internal::atomic_ptr active_implementation{ + &detect_best_supported_implementation_on_first_use_singleton}; + return active_implementation; +#endif } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - // optimization opportunity: implement a custom function. 
- return convert_utf16le_to_latin1(buf, len, latin1_output); +#if SIMDUTF_SINGLE_IMPLEMENTATION +const implementation *get_default_implementation() { + return internal::get_single_implementation(); } - -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = arm_convert_utf16_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +#else +internal::atomic_ptr &get_default_implementation() { + return get_active_implementation(); } +#endif +#define SIMDUTF_GET_CURRENT_IMPLEMENTATION -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = arm_convert_utf16_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { + return get_default_implementation()->validate_utf8(buf, len); } - -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = arm_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written - return ret.first; +simdutf_warn_unused result validate_utf8_with_errors(const char *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf8_with_errors(buf, len); } +#endif // SIMDUTF_FEATURE_UTF8 -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = arm_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += 
scalar_res.count; - } - } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written - return ret.first; +#if SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept { + return get_default_implementation()->validate_ascii(buf, len); } - -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf16le_to_utf8(buf, len, utf8_output); +simdutf_warn_unused result validate_ascii_with_errors(const char *buf, + size_t len) noexcept { + return get_default_implementation()->validate_ascii_with_errors(buf, len); } +#endif // SIMDUTF_FEATURE_ASCII -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf16be_to_utf8(buf, len, utf8_output); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool validate_utf16le_as_ascii(const char16_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf16le_as_ascii(buf, len); } - -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = arm_convert_utf32_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +simdutf_warn_unused bool validate_utf16be_as_ascii(const char16_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf16be_as_ascii(buf, len); } - -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written - return ret.first; +simdutf_warn_unused bool validate_utf16_as_ascii(const char16_t *input, + size_t length) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return validate_utf16be_as_ascii(input, length); + #else + return validate_utf16le_as_ascii(input, length); + #endif } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = arm_convert_utf16_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +#if SIMDUTF_FEATURE_UTF8 && 
SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t convert_utf8_to_utf16( + const char *input, size_t length, char16_t *utf16_output) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf8_to_utf16be(input, length, utf16_output); + #else + return convert_utf8_to_utf16le(input, length, utf16_output); + #endif } - -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = arm_convert_utf16_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t convert_latin1_to_utf8(const char *buf, size_t len, + char *utf8_output) noexcept { + return get_default_implementation()->convert_latin1_to_utf8(buf, len, + utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_latin1_to_utf16le(buf, len, + utf16_output); +} +simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_latin1_to_utf16be(buf, len, + utf16_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *latin1_output) noexcept { + return get_default_implementation()->convert_latin1_to_utf32(buf, len, + latin1_output); +} +// moved to the header file +// simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) noexcept +// simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) noexcept +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) noexcept { + return get_default_implementation()->convert_utf8_to_latin1(buf, len, + latin1_output); +} +simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) noexcept { + return get_default_implementation()->convert_utf8_to_latin1_with_errors( + buf, len, latin1_output); +} +simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) noexcept { + return get_default_implementation()->convert_valid_utf8_to_latin1( + buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *input, size_t length, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf16le(input, length, + utf16_output); +} +simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *input, size_t length, char16_t *utf16_output) noexcept { + return 
get_default_implementation()->convert_utf8_to_utf16be(input, length, + utf16_output); +} +simdutf_warn_unused result convert_utf8_to_utf16_with_errors( + const char *input, size_t length, char16_t *utf16_output) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf8_to_utf16be_with_errors(input, length, utf16_output); + #else + return convert_utf8_to_utf16le_with_errors(input, length, utf16_output); + #endif } - -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = arm_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit code units written - return ret.first; +simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *input, size_t length, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf16le_with_errors( + input, length, utf16_output); } - -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = arm_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit code units written - return ret.first; +simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *input, size_t length, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf16be_with_errors( + input, length, utf16_output); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = arm_convert_utf32_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - latin1_output; - - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *input, size_t length, 
char32_t *utf32_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf32(input, length, + utf32_output); } - -simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf32_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; +simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *input, size_t length, char32_t *utf32_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf32_with_errors( + input, length, utf32_output); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = arm_convert_utf32_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - latin1_output; - - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid( - ret.first, len - (ret.first - buf), ret.second); - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool validate_utf16(const char16_t *buf, + size_t len) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return validate_utf16be(buf, len); + #else + return validate_utf16le(buf, len); + #endif } - -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - // optimization opportunity: implement a custom function. 
- return convert_utf32_to_utf8(buf, len, utf8_output); +void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) noexcept { + return get_default_implementation()->to_well_formed_utf16be(input, len, + output); } - -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = arm_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) noexcept { + return get_default_implementation()->to_well_formed_utf16le(input, len, + output); +} +void to_well_formed_utf16(const char16_t *input, size_t len, + char16_t *output) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + to_well_formed_utf16be(input, len, output); + #else + to_well_formed_utf16le(input, len, output); + #endif } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = arm_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf16le(buf, len); } +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_BASE64 + #if SIMDUTF_ATOMIC_REF +template <typename char_type> +simdutf_warn_unused result atomic_base64_to_binary_safe_impl( + const char_type *input, size_t length, char *output, size_t &outlen, + base64_options options, + last_chunk_handling_options last_chunk_handling_options, + bool decode_up_to_bad_char) noexcept { + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) + // We use a smaller buffer during fuzzing to more easily detect bugs. + constexpr size_t buffer_size = 128; + #else + // Arbitrary block sizes: 4KB for input. + constexpr size_t buffer_size = 4096; + #endif + std::array<char, buffer_size> temp_buffer; + const char_type *const input_init = input; + size_t actual_out = 0; + bool last_chunk = false; + const size_t length_init = length; + result r; + while (!last_chunk) { + last_chunk |= (temp_buffer.size() >= outlen - actual_out); + size_t temp_outlen = (std::min)(temp_buffer.size(), outlen - actual_out); + r = base64_to_binary_safe(input, length, temp_buffer.data(), temp_outlen, + options, last_chunk_handling_options, + decode_up_to_bad_char); + // We processed r.count characters of input. + // We wrote temp_outlen bytes to temp_buffer. + // If there are no ignorable characters, + // we should expect that values/4.0*3 == temp_outlen, + // except maybe at the tail end of the string.
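+ // To make the expected ratio concrete (illustrative arithmetic): with the 4096-byte temp_buffer, a full pass through this loop decodes at most 4096 bytes of binary, which consumes roughly 4096 / 3 * 4 (about 5461) base64 characters, since every four base64 characters encode three bytes.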
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = arm_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; + // + // We are assuming that when r.error == error_code::OUTPUT_BUFFER_TOO_SMALL, + // we truncate the results so that a number of base64 characters divisible + // by four is processed. + // + + // + // We wrote temp_outlen bytes to temp_buffer. + // We need to copy them to output. + // Copy with relaxed atomic operations to the output. + simdutf_log_assert(temp_outlen <= outlen - actual_out, + "Output buffer is too small"); + simdutf_log_assert(temp_outlen <= temp_buffer.size(), + "Output buffer is too small"); + + simdutf::scalar::memcpy_atomic_write(output + actual_out, + temp_buffer.data(), temp_outlen); + actual_out += temp_outlen; + length -= r.count; + input += r.count; + + if (r.error != error_code::OUTPUT_BUFFER_TOO_SMALL) { + break; + } + } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written - return ret.first; -} - -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = arm_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; + if (size_t(input - input_init) != length_init) { + // We did not process all input characters. In such a case, we + // should not end with an ignorable character.
See + // https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + while (input > input_init && base64_ignorable(*(input - 1), options)) { + --input; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written - return ret.first; -} - -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return convert_utf32_to_utf16le(buf, len, utf16_output); + outlen = actual_out; + return {r.error, size_t(input - input_init)}; } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return convert_utf32_to_utf16be(buf, len, utf16_output); +simdutf_warn_unused result atomic_base64_to_binary_safe( + const char *input, size_t length, char *output, size_t &outlen, + base64_options options, + last_chunk_handling_options last_chunk_handling_options, + bool decode_up_to_bad_char) noexcept { + return atomic_base64_to_binary_safe_impl( + input, length, output, outlen, options, last_chunk_handling_options, + decode_up_to_bad_char); } - -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return convert_utf16le_to_utf32(buf, len, utf32_output); +simdutf_warn_unused result atomic_base64_to_binary_safe( + const char16_t *input, size_t length, char *output, size_t &outlen, + base64_options options, + last_chunk_handling_options last_chunk_handling_options, + bool decode_up_to_bad_char) noexcept { + return atomic_base64_to_binary_safe_impl( + input, length, output, outlen, options, last_chunk_handling_options, + decode_up_to_bad_char); } + #endif // SIMDUTF_ATOMIC_REF -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return convert_utf16be_to_utf32(buf, len, utf32_output); -} +#endif // SIMDUTF_FEATURE_BASE64 -void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { - utf16::change_endianness_utf16(input, length, output); +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf16be(buf, len); } - -simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::count_code_points(input, length); +simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, + size_t len) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return validate_utf16be_with_errors(buf, len); + #else + return validate_utf16le_with_errors(buf, len); + #endif } - -simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::count_code_points(input, length); +simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf16le_with_errors(buf, len); } - -simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { - return utf8::count_code_points(input, length); +simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf16be_with_errors(buf, len); } +#endif // SIMDUTF_FEATURE_UTF16 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { - return count_utf8(buf,len); +#if SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf32(buf, len); } - -simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { - return scalar::utf16::latin1_length_from_utf16(length); +simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf32_with_errors(buf, len); } +#endif // SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept { - return scalar::utf32::latin1_length_from_utf32(length); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t convert_valid_utf8_to_utf16( + const char *input, size_t length, char16_t *utf16_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf8_to_utf16be(input, length, utf16_buffer); + #else + return convert_valid_utf8_to_utf16le(input, length, utf16_buffer); + #endif } - -simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept { - // See https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/ - // credit to Pete Cawley - const uint8_t *data = reinterpret_cast(input); - uint64_t result = 0; - const int lanes = sizeof(uint8x16_t); - uint8_t rem = length % lanes; - const uint8_t *simd_end = data + (length / lanes) * lanes; - const uint8x16_t threshold = vdupq_n_u8(0x80); - for (; data < simd_end; data += lanes) { - // load 16 bytes - uint8x16_t input_vec = vld1q_u8(data); - // compare to threshold (0x80) - uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold); - // vertical addition - result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit)); - } - return result + (length / lanes) * lanes + scalar::latin1::utf8_length_from_latin1((const char*)simd_end, rem); +simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *input, size_t length, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_valid_utf8_to_utf16le( + input, length, utf16_buffer); } - -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); +simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *input, size_t length, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_valid_utf8_to_utf16be( + input, length, utf16_buffer); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *input, size_t length, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_valid_utf8_to_utf32( + input, length, utf32_buffer); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 - -simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf16_length_from_latin1(length); +#if SIMDUTF_FEATURE_UTF8 && 
SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *buf, + size_t len, + char *utf8_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf8(buf, len, utf8_buffer); + #else + return convert_utf16le_to_utf8(buf, len, utf8_buffer); + #endif } +simdutf_warn_unused size_t +convert_utf16_to_utf8_safe(const char16_t *buf, size_t len, char *utf8_output, + size_t utf8_len) noexcept { + const auto start{utf8_output}; + // We might be able to go faster by first scanning the input buffer to + // determine how many char16_t characters we can read without exceeding the + // utf8_len. This is a one-pass algorithm that has the benefit of not + // requiring a first pass to determine the length. + while (true) { + // The worst case for convert_utf16_to_utf8 is when you go from 1 char16_t + // to 3 bytes of UTF-8. So we can read at most utf8_len / 3 char16_t + // characters. + auto read_len = std::min(len, utf8_len / 3); + if (read_len <= 16) { + break; + } + if (read_len < len) { + // If we have a high surrogate at the end of the buffer, we need to + // either read one more char16_t or backtrack. + if (scalar::utf16::high_surrogate(buf[read_len - 1])) { + read_len--; + } + } + if (read_len == 0) { + // If we cannot read anything, we are done. + break; + } + const auto write_len = + simdutf::convert_utf16_to_utf8(buf, read_len, utf8_output); + if (write_len == 0) { + // There was an error in the conversion, we cannot continue. + return 0; // indicating failure + } -simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf32_length_from_latin1(length); + utf8_output += write_len; + utf8_len -= write_len; + buf += read_len; + len -= read_len; + } + #if SIMDUTF_IS_BIG_ENDIAN + full_result r = + scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>( + buf, len, utf8_output, utf8_len); + #else + full_result r = + scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>( + buf, len, utf8_output, utf8_len); + #endif + if (r.error != error_code::SUCCESS && + r.error != error_code::OUTPUT_BUFFER_TOO_SMALL) { + // If there was an error, we return 0 to indicate failure.
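+ // (Note that OUTPUT_BUFFER_TOO_SMALL is deliberately tolerated here: in that case the scalar pass stops writing once utf8_len is exhausted, and the byte count accumulated so far is still returned below.)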
+ return 0; // indicating failure + } + return r.output_count + (utf8_output - start); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 - - -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t convert_utf16_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_latin1(buf, len, latin1_buffer); + #else + return convert_utf16le_to_latin1(buf, len, latin1_buffer); + #endif } - -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); +simdutf_warn_unused size_t convert_latin1_to_utf16( + const char *buf, size_t len, char16_t *utf16_output) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_latin1_to_utf16be(buf, len, utf16_output); + #else + return convert_latin1_to_utf16le(buf, len, utf16_output); + #endif } - -simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { - return utf8::utf16_length_from_utf8(input, length); +simdutf_warn_unused size_t convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_latin1(buf, len, + latin1_buffer); +} +simdutf_warn_unused size_t convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_latin1(buf, len, + latin1_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16be_to_latin1( + buf, len, latin1_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16le_to_latin1( + buf, len, latin1_buffer); +} +simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_latin1_with_errors( + buf, len, latin1_buffer); +} +simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_latin1_with_errors( + buf, len, latin1_buffer); +} +// moved to header file +// simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept +// simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *buf, + size_t len, + char *utf8_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_utf8(buf, len, + utf8_buffer); +} +simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *buf, + size_t len, + char *utf8_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_utf8(buf, len, + utf8_buffer); +} +simdutf_warn_unused result convert_utf16_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { + 
#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); + #else + return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); + #endif } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { - const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f); - const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff); - const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); - const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); - size_t pos = 0; - size_t count = 0; - for(;pos + 4 <= length; pos += 4) { - uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); - const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f); - const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff); - const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask); - const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask); - - const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1)); - const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1)); - const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1)); - - const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask); - const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask); - - size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0)); - size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1)); - size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0)); - - count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count; - } - return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused result convert_utf16_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer); + #else + return convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer); + #endif } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { - const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); - const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); - size_t pos = 0; - size_t count = 0; - for(;pos + 4 <= length; pos += 4) { - uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); - const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff); - const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1)); - const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask); - size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0)); - count += 4 + surrogate_count; - } - return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { + return 
get_default_implementation()->convert_utf16le_to_utf8_with_errors( + buf, len, utf8_buffer); } - -simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { - return utf8::count_code_points(input, length); +simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_utf8_with_errors( + buf, len, utf8_buffer); } - -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); +simdutf_warn_unused size_t convert_valid_utf16_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer); + #else + return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); + #endif } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept { - return (options & base64_url) ? compress_decode_base64(output, input, length, options) : compress_decode_base64(output, input, length, options); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t convert_valid_utf16_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf16be_to_latin1(buf, len, latin1_buffer); + #else + return convert_valid_utf16le_to_latin1(buf, len, latin1_buffer); + #endif } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16le_to_utf8( + buf, len, utf8_buffer); } - -simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept { - return (options & base64_url) ? 
compress_decode_base64(output, input, length, options) : compress_decode_base64(output, input, length, options); +simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16be_to_utf8( + buf, len, utf8_buffer); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length, base64_options options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *buf, + size_t len, + char *utf8_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf8(buf, len, + utf8_buffer); } - -size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept { - return encode_base64(output, input, length, options); +simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf8_with_errors( + buf, len, utf8_buffer); } - - -} // namespace arm64 -} // namespace simdutf - -/* begin file src/simdutf/arm64/end.h */ -/* end file src/simdutf/arm64/end.h */ -/* end file src/arm64/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_FALLBACK -/* begin file src/fallback/implementation.cpp */ -/* begin file src/simdutf/fallback/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "fallback" -// #define SIMDUTF_IMPLEMENTATION fallback -/* end file src/simdutf/fallback/begin.h */ - - - - - - - - - -namespace simdutf { -namespace fallback { - -simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { - // If there is a BOM, then we trust it. 
- auto bom_encoding = simdutf::BOM::check_bom(input, length); - if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - int out = 0; - if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } - if((length % 2) == 0) { - if(validate_utf16le(reinterpret_cast(input), length/2)) { out |= encoding_type::UTF16_LE; } - } - if((length % 4) == 0) { - if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } - } - - return out; +simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) noexcept { + return get_default_implementation()->convert_valid_utf32_to_utf8(buf, len, + utf8_buffer); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return scalar::utf8::validate(buf, len); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t convert_utf32_to_utf16( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf32_to_utf16be(buf, len, utf16_buffer); + #else + return convert_utf32_to_utf16le(buf, len, utf16_buffer); + #endif } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { - return scalar::utf8::validate_with_errors(buf, len); +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t convert_utf32_to_latin1( + const char32_t *input, size_t length, char *latin1_output) noexcept { + return get_default_implementation()->convert_utf32_to_latin1(input, length, + latin1_output); } - -simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return scalar::ascii::validate(buf, len); +simdutf_warn_unused result convert_utf32_to_latin1_with_errors( + const char32_t *input, size_t length, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_latin1_with_errors( + input, length, latin1_buffer); } - -simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { - return scalar::ascii::validate_with_errors(buf, len); +simdutf_warn_unused size_t convert_valid_utf32_to_latin1( + const char32_t *input, size_t length, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_valid_utf32_to_latin1( + input, length, latin1_buffer); } +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate(buf, len); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf16le(buf, len, + utf16_buffer); } - -simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate(buf, len); +simdutf_warn_unused size_t convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf16be(buf, len, + utf16_buffer); } - -simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { - return 
scalar::utf16::validate_with_errors(buf, len); +simdutf_warn_unused result convert_utf32_to_utf16_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer); + #else + return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer); + #endif } - -simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate_with_errors(buf, len); +simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf16le_with_errors( + buf, len, utf16_buffer); } - -simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - return scalar::utf32::validate(buf, len); +simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf16be_with_errors( + buf, len, utf16_buffer); } - -simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { - return scalar::utf32::validate_with_errors(buf, len); +simdutf_warn_unused size_t convert_valid_utf32_to_utf16( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer); + #else + return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer); + #endif } - -simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { - return scalar::latin1_to_utf8::convert(buf,len,utf8_output); +simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_valid_utf32_to_utf16le( + buf, len, utf16_buffer); } - -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::latin1_to_utf16::convert(buf, len, utf16_output); +simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_valid_utf32_to_utf16be( + buf, len, utf16_buffer); } - -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::latin1_to_utf16::convert(buf, len, utf16_output); +simdutf_warn_unused size_t convert_utf16_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf32(buf, len, utf32_buffer); + #else + return convert_utf16le_to_utf32(buf, len, utf32_buffer); + #endif } - -simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::latin1_to_utf32::convert(buf,len,utf32_output); +simdutf_warn_unused size_t convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_utf32(buf, len, + utf32_buffer); } - -simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t 
len, char* latin1_output) const noexcept { - return scalar::utf8_to_latin1::convert(buf, len, latin1_output); +simdutf_warn_unused size_t convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_utf32(buf, len, + utf32_buffer); } - -simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept { - return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output); +simdutf_warn_unused result convert_utf16_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer); + #else + return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer); + #endif } - -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { - return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output); +simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_utf32_with_errors( + buf, len, utf32_buffer); } - -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert(buf, len, utf16_output); +simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_utf32_with_errors( + buf, len, utf32_buffer); } - -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert(buf, len, utf16_output); +simdutf_warn_unused size_t convert_valid_utf16_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer); + #else + return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer); + #endif } - -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_with_errors(buf, len, utf16_output); +simdutf_warn_unused size_t convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16le_to_utf32( + buf, len, utf32_buffer); } - -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_with_errors(buf, len, utf16_output); +simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16be_to_utf32( + buf, len, utf32_buffer); } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_valid(buf, len, utf16_output); +#if SIMDUTF_FEATURE_UTF16 +void change_endianness_utf16(const char16_t 
*input, size_t length, + char16_t *output) noexcept { + get_default_implementation()->change_endianness_utf16(input, length, output); } - -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_valid(buf, len, utf16_output); +simdutf_warn_unused size_t count_utf16(const char16_t *input, + size_t length) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return count_utf16be(input, length); + #else + return count_utf16le(input, length); + #endif } - -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf8_to_utf32::convert(buf, len, utf32_output); +simdutf_warn_unused size_t count_utf16le(const char16_t *input, + size_t length) noexcept { + return get_default_implementation()->count_utf16le(input, length); } - -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output); +simdutf_warn_unused size_t count_utf16be(const char16_t *input, + size_t length) noexcept { + return get_default_implementation()->count_utf16be(input, length); } +#endif // SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, - char32_t* utf32_output) const noexcept { - return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t count_utf8(const char *input, + size_t length) noexcept { + return get_default_implementation()->count_utf8(input, length); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert(buf, len, latin1_output); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf, + size_t len) noexcept { + return get_default_implementation()->latin1_length_from_utf8(buf, len); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert(buf, len, latin1_output); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, + size_t len) noexcept { + return get_default_implementation()->utf8_length_from_latin1(buf, len); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert_with_errors(buf, len, latin1_output); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input, + size_t length) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return utf8_length_from_utf16be(input, length); + #else + return utf8_length_from_utf16le(input, length); + #endif } - -simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - return 
scalar::utf16_to_latin1::convert_with_errors(buf, len, latin1_output); +simdutf_warn_unused result utf8_length_from_utf16_with_replacement( + const char16_t *input, size_t length) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return utf8_length_from_utf16be_with_replacement(input, length); + #else + return utf8_length_from_utf16le_with_replacement(input, length); + #endif } - -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert_valid(buf, len, latin1_output); +simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input, + size_t length) noexcept { + return get_default_implementation()->utf8_length_from_utf16le(input, length); } - -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert_valid(buf, len, latin1_output); +simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input, + size_t length) noexcept { + return get_default_implementation()->utf8_length_from_utf16be(input, length); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input, + size_t length) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return utf32_length_from_utf16be(input, length); + #else + return utf32_length_from_utf16le(input, length); + #endif } - -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input, + size_t length) noexcept { + return get_default_implementation()->utf32_length_from_utf16le(input, length); } - -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); +simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input, + size_t length) noexcept { + return get_default_implementation()->utf32_length_from_utf16be(input, length); } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t utf16_length_from_utf8(const char *input, + size_t length) noexcept { + return get_default_implementation()->utf16_length_from_utf8(input, length); } - -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) noexcept { + return get_default_implementation() + ->utf8_length_from_utf16le_with_replacement(input, length); } -simdutf_warn_unused size_t 
implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) noexcept { + return get_default_implementation() + ->utf8_length_from_utf16be_with_replacement(input, length); } -simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - return scalar::utf32_to_latin1::convert(buf, len, latin1_output); -} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input, + size_t length) noexcept { + return get_default_implementation()->utf8_length_from_utf32(input, length); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input, + size_t length) noexcept { + return get_default_implementation()->utf16_length_from_utf32(input, length); } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert(buf, len, utf8_output); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t utf32_length_from_utf8(const char *input, + size_t length) noexcept { + return get_default_implementation()->utf32_length_from_utf8(input, length); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_BASE64 + +// this has been moved to implementation.h +// simdutf_warn_unused size_t +// base64_length_from_binary(size_t length, base64_options option) noexcept; + +// this has been moved to implementation.h +// simdutf_warn_unused size_t base64_length_from_binary_with_lines( +// size_t length, base64_options options, size_t line_length) noexcept; +// } + +simdutf_warn_unused const char *detail::find(const char *start, const char *end, + char character) noexcept { + return get_default_implementation()->find(start, end, character); +} +simdutf_warn_unused const char16_t *detail::find(const char16_t *start, + const char16_t *end, + char16_t character) noexcept { + return get_default_implementation()->find(start, end, character); +} + +simdutf_warn_unused size_t +maximal_binary_length_from_base64(const char *input, size_t length) noexcept { + return get_default_implementation()->maximal_binary_length_from_base64( + input, length); +} + +simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_handling_options) noexcept { + return get_default_implementation()->base64_to_binary( + input, length, output, options, last_chunk_handling_options); +} + +simdutf_warn_unused size_t maximal_binary_length_from_base64( + const 
char16_t *input, size_t length) noexcept { + return get_default_implementation()->maximal_binary_length_from_base64( + input, length); +} + +simdutf_warn_unused result base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_handling_options) noexcept { + return get_default_implementation()->base64_to_binary( + input, length, output, options, last_chunk_handling_options); +} + +// moved to implementation.h +// simdutf_warn_unused bool base64_ignorable(char input, +// base64_options options) noexcept +// simdutf_warn_unused bool base64_ignorable(char16_t input, +// base64_options options) noexcept +// simdutf_warn_unused bool base64_valid(char input, +// base64_options options) noexcept +// simdutf_warn_unused bool base64_valid(char16_t input, +// base64_options options) noexcept +// simdutf_warn_unused bool +// base64_valid_or_padding(char input, base64_options options) noexcept +// simdutf_warn_unused bool +// base64_valid_or_padding(char16_t input, base64_options options) noexcept + +// base64_to_binary_safe_impl is moved to +// include/simdutf/base64_implementation.h + + #if SIMDUTF_ATOMIC_REF +size_t atomic_binary_to_base64(const char *input, size_t length, char *output, + base64_options options) noexcept { + size_t retval = 0; + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) + // We use a smaller buffer during fuzzing to more easily detect bugs. + constexpr size_t input_block_size = 128 * 3; + #else + // Arbitrary block sizes: 3KB for input which produces 4KB in output. + constexpr size_t input_block_size = 1024 * 3; + #endif + std::array<char, input_block_size> inbuf; + for (size_t i = 0; i < length; i += input_block_size) { + const size_t current_block_size = std::min(input_block_size, length - i); + simdutf::scalar::memcpy_atomic_read(inbuf.data(), input + i, + current_block_size); + const size_t written = binary_to_base64(inbuf.data(), current_block_size, + output + retval, options); + retval += written; + } + return retval; +} + #endif // SIMDUTF_ATOMIC_REF + +#endif // SIMDUTF_FEATURE_BASE64 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t convert_latin1_to_utf8_safe( + const char *buf, size_t len, char *utf8_output, size_t utf8_len) noexcept { + const auto start{utf8_output}; -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); -} + while (true) { + // convert_latin1_to_utf8 will never write more than input length * 2 + auto read_len = std::min(len, utf8_len >> 1); + if (read_len <= 16) { + break; + } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); -} + const auto write_len = + simdutf::convert_latin1_to_utf8(buf, read_len, utf8_output); -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert(buf, len, utf16_output); -} + utf8_output += write_len; + utf8_len -= write_len; + buf += read_len; + len -= read_len; + } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert(buf, len, utf16_output); -} +
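+ // The scalar convert_safe pass finishes the short tail (either only a few input bytes remain or little output space is left) and writes at most utf8_len further bytes.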
utf8_output += + scalar::latin1_to_utf8::convert_safe(buf, len, utf8_output, utf8_len); -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); + return utf8_output - start; } - -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_BASE64 +simdutf_warn_unused result +base64_to_binary_safe(const char *input, size_t length, char *output, + size_t &outlen, base64_options options, + last_chunk_handling_options last_chunk_handling_options, + bool decode_up_to_bad_char) noexcept { + return base64_to_binary_safe_impl(input, length, output, outlen, + options, last_chunk_handling_options, + decode_up_to_bad_char); +} +simdutf_warn_unused result +base64_to_binary_safe(const char16_t *input, size_t length, char *output, + size_t &outlen, base64_options options, + last_chunk_handling_options last_chunk_handling_options, + bool decode_up_to_bad_char) noexcept { + return base64_to_binary_safe_impl( + input, length, output, outlen, options, last_chunk_handling_options, + decode_up_to_bad_char); +} + +size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) noexcept { + return get_default_implementation()->binary_to_base64(input, length, output, + options); +} + +size_t binary_to_base64_with_lines(const char *input, size_t length, + char *output, size_t line_length, + base64_options options) noexcept { + return get_default_implementation()->binary_to_base64_with_lines( + input, length, output, line_length, options); +} +#endif // SIMDUTF_FEATURE_BASE64 + +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused simdutf::encoding_type +autodetect_encoding(const char *buf, size_t length) noexcept { + return get_default_implementation()->autodetect_encoding(buf, length); } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); +simdutf_warn_unused int detect_encodings(const char *buf, + size_t length) noexcept { + return get_default_implementation()->detect_encodings(buf, length); } +#endif // SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); +const implementation *builtin_implementation() { + static const implementation *builtin_impl = + get_available_implementations()[SIMDUTF_STRINGIFY( + SIMDUTF_BUILTIN_IMPLEMENTATION)]; + return builtin_impl; } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert(buf, len, utf32_output); +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) { + return scalar::utf8::trim_partial_utf8(input, length); } +#endif // SIMDUTF_FEATURE_UTF8 -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const 
noexcept { - return scalar::utf16_to_utf32::convert(buf, len, utf32_output); +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input, + size_t length) { + return scalar::utf16::trim_partial_utf16(input, length); } -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); +simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input, + size_t length) { + return scalar::utf16::trim_partial_utf16(input, length); } -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); +simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input, + size_t length) { + #if SIMDUTF_IS_BIG_ENDIAN + return trim_partial_utf16be(input, length); + #else + return trim_partial_utf16le(input, length); + #endif } +#endif // SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); -} +} // namespace simdutf +/* end file src/implementation.cpp */ -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); -} +SIMDUTF_PUSH_DISABLE_WARNINGS +SIMDUTF_DISABLE_UNDESIRED_WARNINGS -void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { - scalar::utf16::change_endianness_utf16(input, length, output); +#if SIMDUTF_IMPLEMENTATION_ARM64 +/* begin file src/arm64/implementation.cpp */ +/* begin file src/simdutf/arm64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "arm64" +// #define SIMDUTF_IMPLEMENTATION arm64 +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 +/* end file src/simdutf/arm64/begin.h */ +namespace simdutf { +namespace arm64 { +namespace { +#ifndef SIMDUTF_ARM64_H + #error "arm64.h must be included" +#endif +using namespace simd; + +#if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || \ + SIMDUTF_FEATURE_UTF8 +simdutf_really_inline bool is_ascii(const simd8x64 &input) { + simd8 bits = input.reduce_or(); + return bits.max_val() < 0b10000000u; } +#endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || + // SIMDUTF_FEATURE_UTF8 -simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::count_code_points(input, length); +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_really_inline simd8 +must_be_2_3_continuation(const simd8 prev2, + const simd8 prev3) { + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + return is_third_byte ^ is_fourth_byte; } +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::count_code_points(input, length); -} - -simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { - return scalar::utf8::count_code_points(input, 
length); -} - -simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { - return scalar::utf8::count_code_points(buf,len); -} - -simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { - return scalar::utf16::latin1_length_from_utf16(length); -} - -simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept { - return length; +#if SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32) +// common functions for utf8 conversions +simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) { + // Low half contains 10cccccc|1110aaaa + // High half contains 10bbbbbb|10bbbbbb + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t sh = simdutf_make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1, + 4, 4, 7, 7, 10, 10); + #else + const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10}; + #endif + uint8x16_t perm = vqtbl1q_u8(in, sh); + // Split into half vectors. + // 10cccccc|1110aaaa + uint8x8_t perm_low = vget_low_u8(perm); // no-op + // 10bbbbbb|10bbbbbb + uint8x8_t perm_high = vget_high_u8(perm); + // xxxxxxxx 10bbbbbb + uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op + // xxxxxxxx 1110aaaa + uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op + // Assemble with shift left insert. + // xxxxxxaa aabbbbbb + uint16x4_t mid_high = vsli_n_u16(mid, high, 6); + // (perm_low << 8) | (perm_low >> 8) + // xxxxxxxx 10cccccc + uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low)); + // Shift left insert into the low bits + // aaaabbbb bbcccccc + uint16x4_t composed = vsli_n_u16(low, mid_high, 6); + return composed; } -simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept { - return scalar::latin1::utf8_length_from_latin1(input,length); -} +simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) { + // Converts 6 2 byte UTF-8 characters to 6 UTF-16 characters. + // Technically this calculates 8, but 6 does better and happens more often + // (The languages which use these codepoints use ASCII spaces so 8 would need + // to be in the middle of a very long word). -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::utf8_length_from_utf16(input, length); + // 10bbbbbb 110aaaaa + uint16x8_t upper = vreinterpretq_u16_u8(in); + // (in << 8) | (in >> 8) + // 110aaaaa 10bbbbbb + uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in)); + // 00000000 000aaaaa + uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F)); + // Assemble with shift left insert. + // 00000aaa aabbbbbb + uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6); + return composed; } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::utf8_length_from_utf16(input, length); +simdutf_really_inline uint16x8_t +convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) { + // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. + // This is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. 
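+ // Worked example (illustrative note, not part of the upstream algorithm):
+ // U+00E9 is encoded as 0xC3 0xA9. After the shuffle below its 16-bit lane
+ // holds 0xC3A9; masking with 0x007f keeps 0x0029 (the six continuation
+ // payload bits), masking with 0x1f00 keeps 0x0300 (the five lead payload
+ // bits), and the shift-right-accumulate by 2 adds 0x0300 >> 2 = 0x00C0,
+ // yielding the expected code unit 0x00E9.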
+ uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>( + simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx])); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh)); + // Mask + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000000 00bbbbbb + uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits + // 1 byte: 00000000 00000000 + // 2 byte: 000aaaaa 00000000 + uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits + // Combine with a shift right accumulate + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000aaa aabbbbbb + uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); + return composed; } +#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || + // SIMDUTF_FEATURE_UTF32) +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/arm64/arm_utf16fix.cpp */ +/* + * Returns a nonzero value if the vector of type uint8x16_t is not all zero. + */ +simdutf_really_inline int veq_non_zero(uint8x16_t v) { + // might compile to two instructions: + // umaxv s0, v0.4s + // fmov w0, s0 + // On Apple hardware, they both have a latency of 3 cycles, with a throughput + // of four instructions per cycle. So that's 6 cycles of latency (!!!) for the + // two instructions. A narrowing shift has the same latency and throughput. + return vmaxvq_u32(vreinterpretq_u32_u8(v)); } +/* + * Process one block of 16 characters. If inplace is false, + * copy the block from in to out. If there is a sequencing + * error in the block, overwrite the illsequenced characters + * with the replacement character. This function reads one + * character before the beginning of the buffer as a lookback. + * If that character is illsequenced, it too is overwritten. + */ +template <endianness big_endian, bool inplace> +void utf16fix_block(char16_t *out, const char16_t *in) { + const char16_t replacement = scalar::utf16::replacement(); + uint8x16x2_t lb, block; + uint8x16_t lb_masked, block_masked, lb_is_high, block_is_low; + uint8x16_t illseq; + + constexpr int idx = !match_system(big_endian) ? 0 : 1; + + /* TODO: compute lookback using shifts */ + lb = vld2q_u8((const uint8_t *)(in - 1)); + block = vld2q_u8((const uint8_t *)in); + lb_masked = vandq_u8(lb.val[idx], vdupq_n_u8(0xfc)); + block_masked = vandq_u8(block.val[idx], vdupq_n_u8(0xfc)); + lb_is_high = vceqq_u8(lb_masked, vdupq_n_u8(0xd8)); + block_is_low = vceqq_u8(block_masked, vdupq_n_u8(0xdc)); + + illseq = veorq_u8(lb_is_high, block_is_low); + if (veq_non_zero(illseq)) { + uint8x16_t lb_illseq, block_illseq; + char16_t lbc; + int ill; + + /* compute the cause of the illegal sequencing */ + lb_illseq = vbicq_u8(lb_is_high, block_is_low); + block_illseq = vorrq_u8(vbicq_u8(block_is_low, lb_is_high), + vextq_u8(lb_illseq, vdupq_n_u8(0), 1)); + + /* fix illegal sequencing in the lookback */ + ill = vgetq_lane_u8(lb_illseq, 0); + lbc = out[-1]; + out[-1] = ill ?
replacement : lbc; + + /* fix illegal sequencing in the main block */ + if simdutf_constexpr (!match_system(big_endian)) { + block.val[1] = vbslq_u8(block_illseq, vdupq_n_u8(0xfd), block.val[1]); + block.val[0] = vorrq_u8(block_illseq, block.val[0]); + } else { + block.val[0] = vbslq_u8(block_illseq, vdupq_n_u8(0xfd), block.val[0]); + block.val[1] = vorrq_u8(block_illseq, block.val[1]); + } -simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { - return scalar::utf8::utf16_length_from_utf8(input, length); + vst2q_u8((uint8_t *)out, block); + } else if (!inplace) { + vst2q_u8((uint8_t *)out, block); + } } -simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { - return scalar::utf32::utf8_length_from_utf32(input, length); +template +uint8x16_t get_mismatch_copy(const char16_t *in, char16_t *out) { + constexpr int idx = !match_system(big_endian) ? 0 : 1; + uint8x16x2_t lb = vld2q_u8((const uint8_t *)(in - 1)); + uint8x16x2_t block = vld2q_u8((const uint8_t *)in); + uint8x16_t lb_masked = vandq_u8(lb.val[idx], vdupq_n_u8(0xfc)); + uint8x16_t block_masked = vandq_u8(block.val[idx], vdupq_n_u8(0xfc)); + uint8x16_t lb_is_high = vceqq_u8(lb_masked, vdupq_n_u8(0xd8)); + uint8x16_t block_is_low = vceqq_u8(block_masked, vdupq_n_u8(0xdc)); + uint8x16_t illseq = veorq_u8(lb_is_high, block_is_low); + if (!inplace) { + vst2q_u8((uint8_t *)out, block); + } + return illseq; } -simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { - return scalar::utf32::utf16_length_from_utf32(input, length); -} +simdutf_really_inline uint64_t get_mask(uint8x16_t illse0, uint8x16_t illse1, + uint8x16_t illse2, uint8x16_t illse3) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + uint8x16_t bit_mask = + simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); +#else + uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + uint8x16_t sum0 = + vpaddq_u8(vandq_u8(illse0, bit_mask), vandq_u8(illse1, bit_mask)); + uint8x16_t sum1 = + vpaddq_u8(vandq_u8(illse2, bit_mask), vandq_u8(illse3, bit_mask)); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); +} + +// The idea is to process 64 characters at a time, and if there is a mismatch +// we can fix it with a bit of scalar code. When the input is correct, this +// function might be faster than alternative implementations working on small +// blocks of input. +template +bool utf16fix_block64(char16_t *out, const char16_t *in) { + const char16_t replacement = scalar::utf16::replacement(); + + uint8x16_t illse0 = inplace ? get_mismatch_copy(in, out) + : get_mismatch_copy(in, out); + uint8x16_t illse1 = + inplace ? get_mismatch_copy(in + 16, out + 16) + : get_mismatch_copy(in + 16, out + 16); + uint8x16_t illse2 = + inplace ? get_mismatch_copy(in + 32, out + 32) + : get_mismatch_copy(in + 32, out + 32); + uint8x16_t illse3 = + inplace ? 
get_mismatch_copy(in + 48, out + 48) + : get_mismatch_copy(in + 48, out + 48); + // this branch could be marked as unlikely: + if (veq_non_zero( + vorrq_u8(vorrq_u8(illse0, illse1), vorrq_u8(illse2, illse3)))) { + uint64_t matches = get_mask(illse0, illse1, illse2, illse3); + // Given that ARM has a fast bitreverse instruction, we can + // reverse once and then use clz to find the first bit set. + // It is how it is done in simdjson and *might* be beneficial. + // + // We might also proceed in reverse to reduce the RAW hazard, + // but it might require more instructions. -simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf32_length_from_latin1(length); + while (matches != 0) { + int r = trailing_zeroes(matches); // generates rbit + clz + // Either we have a high surrogate followed by a non-low surrogate + // or we have a low surrogate not preceded by a high surrogate. + bool is_high = scalar::utf16::is_high_surrogate(in[r - 1]); + out[r - is_high] = replacement; + matches = clear_least_significant_bit(matches); + } + return false; + } + return true; } -simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { - return scalar::utf8::count_code_points(input, length); -} +template +void utf16fix_neon_64bits(const char16_t *in, size_t n, char16_t *out) { + size_t i; + const char16_t replacement = scalar::utf16::replacement(); + if (n < 17) { + return scalar::utf16::to_well_formed_utf16(in, n, out); + } + out[0] = + scalar::utf16::is_low_surrogate(in[0]) ? replacement : in[0]; + i = 1; -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); -} + /* duplicate code to have the compiler specialise utf16fix_block() */ + if (in == out) { + for (i = 1; i + 64 < n; i += 64) { + utf16fix_block64(out + i, in + i); + } -simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept { - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = length; // location of the first padding character if any - size_t equalsigns = 0; - if(length > 0 && input[length - 1] == '=') { - length -= 1; - equalsigns++; - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; + for (; i + 16 < n; i += 16) { + utf16fix_block(out + i, in + i); } - if(length > 0 && input[length - 1] == '=') { - equalsigns++; - length -= 1; + + /* tbd: find carry */ + utf16fix_block(out + n - 16, in + n - 16); + } else { + for (i = 1; i + 64 < n; i += 64) { + utf16fix_block64(out + i, in + i); } - } - if(length == 0) { - if(equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; + for (; i + 16 < n; i += 16) { + utf16fix_block(out + i, in + i); } - return {SUCCESS, 0}; + + utf16fix_block(out + n - 16, in + n - 16); } - result r = scalar::base64::base64_tail_decode(output, input, length, options); - if(r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; + out[n - 1] = scalar::utf16::is_high_surrogate(out[n - 1]) + ? 
replacement + : out[n - 1]; +} +/* end file src/arm64/arm_utf16fix.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/arm64/arm_validate_utf16.cpp */ +template <endianness big_endian> +const char16_t *arm_validate_utf16(const char16_t *input, size_t size) { + const char16_t *end = input + size; + const auto v_d8 = simd8<uint8_t>::splat(0xd8); + const auto v_f8 = simd8<uint8_t>::splat(0xf8); + const auto v_fc = simd8<uint8_t>::splat(0xfc); + const auto v_dc = simd8<uint8_t>::splat(0xdc); + while (end - input >= 16) { + // 0. Load data: since the validation takes into account only the higher + // byte of each word, we compress the two vectors into one which + // consists of only the higher bytes. + auto in0 = simd16<uint16_t>(input); + auto in1 = + simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t)); + if simdutf_constexpr (!match_system(big_endian)) { + in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0))); + in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1))); + } + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1); + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); + if (surrogates_wordmask == 0) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint64_t V = ~surrogates_wordmask; + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = ((in & v_fc) == v_dc); + const uint64_t H = vH.to_bitmask64(); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint64_t L = ~H & surrogates_wordmask; + + const uint64_t a = + L & (H >> 4); // A low surrogate must be followed by a high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint64_t b = + a << 4; // Just mark that the opposite fact holds; + // thanks to that we have only two masks for the valid case. + const uint64_t c = V | a | b; // Combine all the masks into the final one. + if (c == ~0ull) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += 16; + } else if (c == 0xfffffffffffffffull) { + // The 15 lower code units of the input register contain valid UTF-16. + // The 15th word may be either a low or high surrogate. In the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject a sole high surrogate.
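+ // Example (illustrative): if the 15th code unit is 0xD801 and its partner
+ // 0xDC37 opens the next 16-unit block, advancing by 15 lets the next
+ // iteration see the pair contiguously, so pairs that straddle two
+ // registers are validated without any extra carry logic.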
+ input += 15; + } else { + return nullptr; + } } } - return r; -} - -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); + return input; } -simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept { - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = length; // location of the first padding character if any - size_t equalsigns = 0; - if(length > 0 && input[length - 1] == '=') { - length -= 1; - equalsigns++; - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if(length > 0 && input[length - 1] == '=') { - equalsigns++; - length -= 1; - } - } - if(length == 0) { - if(equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - return {SUCCESS, 0}; - } - result r = scalar::base64::base64_tail_decode(output, input, length, options); - if(r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; +template <endianness big_endian> +const char16_t *arm_validate_utf16_as_ascii(const char16_t *input, + size_t size) { + const char16_t *end = input + size; + while (end - input >= 16) { + uint16x8_t in1 = vld1q_u16(reinterpret_cast<const uint16_t *>(input)); + uint16x8_t in2 = vld1q_u16(reinterpret_cast<const uint16_t *>(input + 8)); + uint16x8_t inor = vorrq_u16(in1, in2); + if simdutf_constexpr (!match_system(big_endian)) { + inor = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inor))); + } + // next we compute inor > 0x7f + uint16x8_t cmp = vcgtq_u16(inor, vdupq_n_u16(0x7f)); + uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0); + if (mask) { + return nullptr; + } + input += 16; + } + return input; } -simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length, base64_options options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); +template <endianness big_endian> +const result arm_validate_utf16_with_errors(const char16_t *input, + size_t size) { + const char16_t *start = input; + const char16_t *end = input + size; + + const auto v_d8 = simd8<uint8_t>::splat(0xd8); + const auto v_f8 = simd8<uint8_t>::splat(0xf8); + const auto v_fc = simd8<uint8_t>::splat(0xfc); + const auto v_dc = simd8<uint8_t>::splat(0xdc); + while (input + 16 < end) { + // 0. Load data: since the validation takes into account only the higher + // byte of each word, we compress the two vectors into one which + // consists of only the higher bytes. + auto in0 = simd16<uint16_t>(input); + auto in1 = + simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t)); + + if simdutf_constexpr (!match_system(big_endian)) { + in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0))); + in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1))); + } + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1); + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); + if (surrogates_wordmask == 0) { + input += 16; + } else { + // 2.
We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint64_t V = ~surrogates_wordmask; + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = ((in & v_fc) == v_dc); + const uint64_t H = vH.to_bitmask64(); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint64_t L = ~H & surrogates_wordmask; + + const uint64_t a = + L & (H >> 4); // A low surrogate must be followed by a high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint64_t b = + a << 4; // Just mark that the opposite fact holds; + // thanks to that we have only two masks for the valid case. + const uint64_t c = V | a | b; // Combine all the masks into the final one. + if (c == ~0ull) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += 16; + } else if (c == 0xfffffffffffffffull) { + // The 15 lower code units of the input register contain valid UTF-16. + // The 15th word may be either a low or high surrogate. In the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject a sole high surrogate. + input += 15; + } else { + return result(error_code::SURROGATE, input - start); + } + } + } + return result(error_code::SUCCESS, input - start); } +/* end file src/arm64/arm_validate_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/arm64/arm_validate_utf32le.cpp */ -size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept { - return scalar::base64::tail_encode_base64(output, input, length, options); -} -} // namespace fallback -} // namespace simdutf +const char32_t *arm_validate_utf32le(const char32_t *input, size_t size) { + const char32_t *end = input + size; -/* begin file src/simdutf/fallback/end.h */ -/* end file src/simdutf/fallback/end.h */ -/* end file src/fallback/implementation.cpp */ -#endif + const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); + const uint32x4_t offset = vmovq_n_u32(0xffff2000); + const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); + uint32x4_t currentmax = vmovq_n_u32(0x0); + uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + while (end - input >= 4) { + const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input)); + currentmax = vmaxq_u32(in, currentmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); + input += 4; + } #if SIMDUTF_IMPLEMENTATION_ICELAKE -/* begin file src/icelake/implementation.cpp */ + uint32x4_t is_zero = + veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); + if (vmaxvq_u32(is_zero) != 0) { + return nullptr; + } -/* begin file src/simdutf/icelake/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "icelake" -// #define SIMDUTF_IMPLEMENTATION icelake + is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (vmaxvq_u32(is_zero) != 0) { + return nullptr; + } -#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE -// nothing needed.
-#else -SIMDUTF_TARGET_ICELAKE -#endif + is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (vmaxvq_u32(is_zero) != 0) { + return nullptr; + } -#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 -SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) -#endif // end of workaround -/* end file src/simdutf/icelake/begin.h */ -namespace simdutf { -namespace icelake { -namespace { -#ifndef SIMDUTF_ICELAKE_H -#error "icelake.h must be included" -#endif -/* begin file src/icelake/icelake_utf8_common.inl.cpp */ -// Common procedures for both validating and non-validating conversions from UTF-8. -enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL}; + return input; +} -using utf8_to_utf16_result = std::pair; -using utf8_to_utf32_result = std::pair; +const result arm_validate_utf32le_with_errors(const char32_t *input, + size_t size) { + const char32_t *start = input; + const char32_t *end = input + size; -/* - process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8 - to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes) - might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which - indicates how many input bytes are relevant. + const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); + const uint32x4_t offset = vmovq_n_u32(0xffff2000); + const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); + uint32x4_t currentmax = vmovq_n_u32(0x0); + uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); - Returns true when the result is correct, otherwise it returns false. + while (end - input >= 4) { + const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); + currentmax = vmaxq_u32(in, currentmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); - The provided in and out pointers are advanced according to how many input - bytes have been processed, upon success. -*/ -template -simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) { - // constants - __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0); - __m512i mask_80808080 = _mm512_set1_epi32(0x80808080); - __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0); - __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf); - __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2); - __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff); - __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0); - __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00); - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - // Note that 'tail' is a compile-time constant ! - __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1; - __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in); - __mmask64 m1 = (tail == SIMDUTF_FULL) ? 
_mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080); - if(_ktestc_mask64_u8(m1, b)) {// NOT(m1) AND b -- if all zeroes, then all ASCII - // alternatively, we could do 'if (m1 == b) { ' - if (tail == SIMDUTF_FULL) { - in += 64; // consumed 64 bytes - // we convert a full 64-byte block, writing 128 bytes. - __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); - if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } - _mm512_storeu_si512(out, input1); - out += 32; - __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); - if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); } - _mm512_storeu_si512(out, input2); - out += 32; - return true; // we are done - } else { - in += gap; - if (gap <= 32) { - __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); - if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1); - out += gap; - } else { - __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); - if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } - _mm512_storeu_si512(out, input1); - out += 32; - __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); - if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); } - _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2); - out += gap - 32; - } - return true; // we are done + uint32x4_t is_zero = + veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); + if (vmaxvq_u32(is_zero) != 0) { + return result(error_code::TOO_LARGE, input - start); } - } - // classify characters further - __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input, - _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte - __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input, - _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte - - __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2, - _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence) - // Overlong 2-byte sequence - if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) { - // Overlong 2-byte sequence - return false; - } - if (_ktestz_mask64_u8(m34, m34) == 0) { - // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence! - __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0, - _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes) - __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b); + is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (vmaxvq_u32(is_zero) != 0) { + return result(error_code::SURROGATE, input - start); + } - __mmask64 mp1 = _kshiftli_mask64(m234, 1); - __mmask64 mp2 = _kshiftli_mask64(m34, 2); - // We could do it as follows... 
- // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes - // but GCC generates better code when we do: - if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes - // Fast path with 1,2,3 bytes - __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes - __mmask64 m1234 = _kor_mask64(m1, m234); - // mismatched continuation bytes: - if (tail == SIMDUTF_FULL) { - __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ - // the presence of a 1 bit indicates that they overlap. - // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes. - if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; } - } else { - __mmask64 bxorm1234 = _kxor_mask64(b, m1234); - if (mc != bxorm1234) { return false; } - } - // mend: identifying the last bytes of each sequence to be decoded - __mmask64 mend = _kshiftri_mask64(m1234, 1); - if (tail != SIMDUTF_FULL) { - mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1))); - } + input += 4; + } + return result(error_code::SUCCESS, input - start); +} +/* end file src/arm64/arm_validate_utf32le.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING - __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); - __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); - - __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 - __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII - __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16, - clearedbytes); // the last byte of each character - - __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes - __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes - __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); - __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes, - beforeasciibytes); // the second last bytes (of two, three byte seq, - // surrogates) - secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position - - __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff, - indexofsecondlastbytes); // indices of the second last bytes - __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34, - clearedbytes); // only those that are the third last byte of a sequence - __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes, - thirdlastbyte); // the third last bytes (of three byte sequences, hi - // surrogate) - thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position - __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254); - // the elements of Wout excluding the last element if it happens to be a high surrogate: - - __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output. 
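+ // Scalar restatement of the 0xffff2000 offset trick used above (an
+ // illustrative sketch, not part of the generated file; it assumes 32-bit
+ // unsigned wraparound). Adding 0xffff2000 sends the surrogate range
+ // 0xD800..0xDFFF to 0xFFFFF800..0xFFFFFFFF, strictly above 0xFFFFF7FF,
+ // while every other value up to 0x10FFFF lands at or below that bound
+ // (values >= 0xE000 wrap around to small sums):
+ //
+ //   bool valid_utf32_code_point(uint32_t c) {
+ //     return c <= 0x10FFFF && uint32_t(c + 0xFFFF2000u) <= 0xFFFFF7FFu;
+ //   }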
+#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/arm64/arm_convert_latin1_to_utf16.cpp */ +template <endianness big_endian> +std::pair<const char *, char16_t *> +arm_convert_latin1_to_utf16(const char *buf, size_t len, + char16_t *utf16_output) { + const char *end = buf + len; + + while (end - buf >= 16) { + uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf)); + uint16x8_t inlow = vmovl_u8(vget_low_u8(in8)); + if simdutf_constexpr (!match_system(big_endian)) { + inlow = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inlow))); + } + vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), inlow); + uint16x8_t inhigh = vmovl_u8(vget_high_u8(in8)); + if simdutf_constexpr (!match_system(big_endian)) { + inhigh = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inhigh))); + } + vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output + 8), inhigh); + utf16_output += 16; + buf += 16; + } + return std::make_pair(buf, utf16_output); +} +/* end file src/arm64/arm_convert_latin1_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/arm64/arm_convert_latin1_to_utf32.cpp */ +std::pair<const char *, char32_t *> +arm_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + const char *end = buf + len; + + while (end - buf >= 16) { + uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf)); + uint16x8_t in8low = vmovl_u8(vget_low_u8(in8)); + uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low)); + uint32x4_t in16lowhigh = vmovl_u16(vget_high_u16(in8low)); + uint16x8_t in8high = vmovl_u8(vget_high_u8(in8)); + uint32x4_t in8highlow = vmovl_u16(vget_low_u16(in8high)); + uint32x4_t in8highhigh = vmovl_u16(vget_high_u16(in8high)); + vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output), in16lowlow); + vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 4), in16lowhigh); + vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 8), in8highlow); + vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 12), in8highhigh); + + utf32_output += 16; + buf += 16; + } + return std::make_pair(buf, utf32_output); +} +/* end file src/arm64/arm_convert_latin1_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/arm64/arm_convert_latin1_to_utf8.cpp */ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routine should carry on the conversion of the tail.
+*/ +std::pair +arm_convert_latin1_to_utf8(const char *latin1_input, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char *end = latin1_input + len; + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + // We always write 16 bytes, of which more than the first 8 bytes + // are valid. A safety margin of 8 is more than sufficient. + while (end - latin1_input >= 16 + 8) { + uint8x16_t in8 = vld1q_u8(reinterpret_cast(latin1_input)); + if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!! + vst1q_u8(utf8_output, in8); + utf8_output += 16; + latin1_input += 16; + continue; } - // - // We have a 4-byte sequence, this is the general case. - // Slow! - __mmask64 mp3 = _kshiftli_mask64(m4, 3); - __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes - __mmask64 m1234 = _kor_mask64(m1, m234); - // mend: identifying the last bytes of each sequence to be decoded - __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3); - if (tail != SIMDUTF_FULL) { - mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1))); - } - __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); - __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); - - __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 - __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII - __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16, - clearedbytes); // the last byte of each character - - __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes - __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes - __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); - __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes, - beforeasciibytes); // the second last bytes (of two, three byte seq, - // surrogates) - secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position - - __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff, - indexofsecondlastbytes); // indices of the second last bytes - __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34, - clearedbytes); // only those that are the third last byte of a sequence - __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes, - thirdlastbyte); // the third last bytes (of three byte sequences, hi - // surrogate) - thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position - __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254); - uint64_t Mlo_uint64 = _pext_u64(mp3, mend); - __mmask32 Mlo = __mmask32(Mlo_uint64); - __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1); - __m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo, - mask_dc00dc00); // lo surr: 1101110000000000, other: 0000000000000000 - __m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes, - 4); // hi surr: 00000WVUTSRQPNML vuts = WVUTS - 1 - __m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes, - lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other: unchanged - __m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes, - mask_d7c0d7c0); 
// hi sur: 110110vutsRQPNML, other: unchanged - // the elements of Wout excluding the last element if it happens to be a high surrogate: - __mmask32 Mout = ~(Mhi & 0x80000000); - __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output. + // We just fallback on UTF-16 code. This could be optimized/simplified + // further. + uint16x8_t in16 = vmovl_u8(vget_low_u8(in8)); + // 1. prepare 2-byte values + // input 8-bit word : [aabb|bbbb] x 8 + // expected output : [1100|00aa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + // t0 = [0000|00aa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in16, 2); + // t1 = [0000|00aa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in16, v_003f); + // t3 = [0000|00aa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [1100|00aa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f); + const uint8x16_t utf8_unpacked = + vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); - // mismatched continuation bytes: - if (tail == SIMDUTF_FULL) { - __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ - // the presence of a 1 bit indicates that they overlap. - // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes. - if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; } - } else { - __mmask64 bxorm1234 = _kxor_mask64(b, m1234); - if (mc != bxorm1234) { return false; } - } - // Encodings out of range... - { - // the location of 3-byte sequence start bytes in the input - __mmask64 m3 = m34 & (b ^ m4); - // code units in Wout corresponding to 3-byte sequences. 
- __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); - __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); - __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); - __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); - __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); - __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); - __m512i mask_04000400 = _mm512_set1_epi32(0x04000400); - __mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400); - if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { return false; } - } - in += 64 - _lzcnt_u64(mprocessed); - int64_t nout = _mm_popcnt_u64(mprocessed); - if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); } - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); - out += nout; - return true; // ok - } - // Fast path 2: all ASCII or 2 byte - __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b); - // on top of -0xc0 we subtract -2 which we get back later of the - // continuation byte tags - __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2); - __mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence - if (tail == SIMDUTF_FULL) { - __mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading); - if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { return false; } - } else { - __mmask64 bxorleading = _kxor_mask64(b, leading); - if (_kshiftli_mask64(m234, 1) != bxorleading) { return false; } - } - // - if (tail == SIMDUTF_FULL) { - // In the two-byte/ASCII scenario, we are easily latency bound, so we want - // to increment the input buffer as quickly as possible. - // We process 32 bytes unless the byte at index 32 is a continuation byte, - // in which case we include it as well for a total of 33 bytes. - // Note that if x is an ASCII byte, then the following is false: - // int8_t(x) <= int8_t(0xc0) under two's complement. - in += 32; - if(int8_t(*in) <= int8_t(0xc0)) in++; - // The alternative is to do - // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); - // but it requires loading the input, doing the mask computation, and converting - // back the mask to a general register. It just takes too long, leaving the - // processor likely to be idle. - } else { - in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); - } - __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte); // will contain zero for ascii, and the data - lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead)); // ... zero extended into code units - __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence - follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow)); // ... zero extended into code units - lead = _mm512_slli_epi16(lead, 6); // shifted into position - __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + // 6. adjust pointers + latin1_input += 8; + utf8_output += row[0]; - if(big_endian) { final = _mm512_shuffle_epi8(final, byteflip); } - if (tail == SIMDUTF_FULL) { - // Next part is UTF-16 specific and can be generalized to UTF-32. 
- int nout = _mm_popcnt_u32(uint32_t(leading)); - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); - out += nout; // UTF-8 to UTF-16 is only expansionary in this case. - } else { - int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading))); - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); - out += nout; // UTF-8 to UTF-16 is only expansionary in this case. - } + } // while - return true; // we are fine. + return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output)); } +/* end file src/arm64/arm_convert_latin1_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/arm64/arm_convert_utf8_to_latin1.cpp */ +// Convert up to 16 bytes from utf8 to latin1 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 16, usually 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. - - -/* - utf32_to_utf16_masked converts `count` lower UTF-32 code units - from input `utf32` into UTF-16. It differs from utf32_to_utf16 - in that it 'masks' the writes. - - Returns how many 16-bit code units were stored. - - byteflip is used for flipping 16-bit code units, and it should be - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - We pass it to the (always inlined) function to encourage the compiler to - keep the value in a (constant) register. -*/ + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. + if (utf8_end_of_code_point_mask == 0xfff) { + // We process in chunks of 12 bytes + vst1q_u8(reinterpret_cast<uint8_t *>(latin1_output), in); + latin1_output += 12; // We wrote 12 8-bit characters. + return 12; // We consumed 12 bytes. + } + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fall back. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if (idx >= 64) { + return consumed; + } + // Here we should have (idx < 64); if not, there is a bug in the validation + // or elsewhere. We process SIX (6) input code units: the max length in + // bytes of six code units spanning between 1 and 2 bytes each is 12 bytes. + uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>( + simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh)); + // Mask + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000000 00bbbbbb + uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits + // 1 byte: 00000000 00000000 + // 2 byte: 000aaaaa 00000000 + uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits + // Combine with a shift right accumulate + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000aaa aabbbbbb + uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); + // writing 8 bytes even though we only care about the first 6 bytes. + uint8x8_t latin1_packed = vmovn_u16(composed); + vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed); + latin1_output += 6; // We wrote 6 bytes. + return consumed; } +/* end file src/arm64/arm_convert_utf8_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */ +// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 16, usually 12). template <endianness big_endian> -simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) { +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. - const __mmask16 valid = uint16_t((1 << count) - 1); - // 1. check if we have any surrogate pairs - const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); - const __mmask16 sp_mask = _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff); + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) { + // We process in chunks of 16 bytes + // The routine in simd.h is reused. + simd8<int8_t> temp{vreinterpretq_s8_u8(in)}; + temp.store_ascii_as_utf16(utf16_output); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } - if (sp_mask == 0) { - if(big_endian) { - _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip))); + // 3 byte sequences are the next most common, as seen in CJK, which has long + // sequences of these.
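+ // Illustrative note: 0x924 is 0b1001'0010'0100, i.e. bits 2, 5, 8 and 11
+ // are set. Each set bit marks the final byte of a code point within the
+ // twelve bytes considered, which is exactly the pattern produced by four
+ // consecutive 3-byte sequences (bytes 0-2, 3-5, 6-8 and 9-11).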
+ if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. + uint16x4_t composed = convert_utf8_3_byte_to_utf16(in); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed))); + } + vst1_u16(reinterpret_cast(utf16_output), composed); + utf16_output += 4; // We wrote 4 16-bit characters. + return 12; // We consumed 12 bytes. + } - } else { - _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32)); - } - return count; + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte + // UTF-16 code units. + uint16x8_t composed = convert_utf8_2_byte_to_utf16(in); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = + vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); } + vst1q_u16(reinterpret_cast(utf16_output), composed); - { - // build surrogate pair code units in 32-bit lanes - - // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] - const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); - const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); - - // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] - const __m512i t1 = _mm512_slli_epi32(t0, 6); - - // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0 - // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) - const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); - const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); - - // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0 - // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 - const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); - const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); - const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); - const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); - __m512i t5 = _mm512_ror_epi32(t4, 16); - // Here we want to trim all of the upper 16-bit code units from the 2-byte - // characters represented as 4-byte values. We can compute it from - // sp_mask or the following... It can be more optimized! - const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); - const __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2*count)) - 1)); - if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); } - // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4) - __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5); - _mm512_mask_storeu_epi16(output, (1<<(count + static_cast(count_ones(sp_mask)))) - 1, compressed); - //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5); - } - - return count + static_cast(count_ones(sp_mask)); -} + utf16_output += 6; // We wrote 6 16-bit characters. + return 12; // We consumed 12 bytes. + } -/* - utf32_to_utf16 converts `count` lower UTF-32 code units - from input `utf32` into UTF-16. It may overflow. + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fallback. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; - Returns how many 16-bit code units were stored. 
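The constants tested by these fast paths read most naturally as end-of-code-point masks. A scalar sketch of how such a mask could be derived, an illustration that assumes complete, valid input (the real kernels obtain the mask from the validation stage):

#include <cstddef>
#include <cstdint>

// Bit i is set when byte i ends a code point, i.e. when the next byte
// is not a continuation byte (0b10xxxxxx).
static uint64_t end_of_code_point_mask(const uint8_t *p, size_t n) {
  uint64_t mask = 0;
  for (size_t i = 0; i + 1 < n; i++)
    if ((p[i + 1] & 0xC0) != 0x80) mask |= uint64_t(1) << i;
  if (n) mask |= uint64_t(1) << (n - 1); // final byte closes a code point
  return mask;
}
// Twelve ASCII bytes -> 0xFFF (every bit set)
// Six 2-byte units   -> 0xAAA (ends at odd offsets)
// Four 3-byte units  -> 0x924 (ends at offsets 2, 5, 8, 11)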
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; - byteflip is used for flipping 16-bit code units, and it should be - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - We pass it to the (always inlined) function to encourage the compiler to - keep the value in a (constant) register. -*/ -template -simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) { - // check if we have any surrogate pairs - const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); - const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); - - if (sp_mask == 0) { - // technically, it should be _mm256_storeu_epi16 - if(big_endian) { - _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),_mm512_castsi512_si256(byteflip))); - } else { - _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32)); - } - return count; + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = + vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); } - - { - // build surrogate pair code units in 32-bit lanes - - // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] - const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); - const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); - - // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] - const __m512i t1 = _mm512_slli_epi32(t0, 6); - - // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0 - // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) - const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); - const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); - - // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0 - // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 - const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); - const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); - const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); - const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); - __m512i t5 = _mm512_ror_epi32(t4, 16); - const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); - if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); } - // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4) - __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5); - _mm512_mask_storeu_epi16(output, (1<<(count + static_cast(count_ones(sp_mask)))) - 1, compressed); - //_mm512_mask_compressstoreu_epi16(output, nonzero, t5); + // Store + vst1q_u16(reinterpret_cast(utf16_output), composed); + utf16_output += 6; // We wrote 6 16-bit characters. + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + uint8x16_t sh = vld1q_u8(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // XXX: depending on the system scalar instructions might be faster. 
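For readers auditing the idx < 145 branch that follows, here is the same per-lane arithmetic in scalar form. This is a sketch, not the vendored code; the masks mirror the NEON constants (0x7F for the trailing payload, the 0xFF00 middle byte kept by vbic, and the lead nibble inserted at bit 12):

#include <cstdint>

// One 32-bit lane after the shuffle: last byte lowest, middle byte next,
// lead byte (1110aaaa) at bits 16-23, or zero for shorter sequences.
static inline uint16_t utf16_from_lane3(uint32_t lane) {
  uint16_t low16   = uint16_t(lane);
  uint16_t ascii   = low16 & 0x007F;                  // payload of the last byte
  uint16_t middle  = low16 & 0xFF00;                  // middle byte, tag bits included
  uint16_t mid_low = uint16_t(ascii + (middle >> 2)); // tag garbage lands at bits 12-13
  uint16_t high    = uint16_t(lane >> 16);            // lead byte or zero
  return uint16_t((mid_low & 0x0FFF) | (high << 12)); // insert aaaa, clearing the garbage
}
// utf16_from_lane3(0x00E282AC) == 0x20AC (U+20AC, euro sign, UTF-8 E2 82 AC).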
+ // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // 1 byte: 00000000 0ccccccc + // 2 byte: xx0bbbbb x0cccccc + // 3 byte: xxbbbbbb x0cccccc + uint16x4_t lowperm = vmovn_u32(perm); + // Partially mask with bic (doesn't require a temporary register unlike and) + // The shift left insert below will clear the top bits. + // 1 byte: 00000000 00000000 + // 2 byte: xx0bbbbb 00000000 + // 3 byte: xxbbbbbb 00000000 + uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00))); + // ASCII + // 1 byte: 00000000 0ccccccc + // 2+byte: 00000000 00cccccc + uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F)); + // Split into narrow vectors. + // 2 byte: 00000000 00000000 + // 3 byte: 00000000 xxxxaaaa + uint16x4_t highperm = vshrn_n_u32(perm, 16); + // Shift right accumulate the middle byte + // 1 byte: 00000000 0ccccccc + // 2 byte: 00xx0bbb bbcccccc + // 3 byte: 00xxbbbb bbcccccc + uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2); + // Shift left and insert the top 4 bits, overwriting the garbage + // 1 byte: 00000000 0ccccccc + // 2 byte: 00000bbb bbcccccc + // 3 byte: aaaabbbb bbcccccc + uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed))); } + vst1_u16(reinterpret_cast(utf16_output), composed); - return count + static_cast(count_ones(sp_mask)); -} + utf16_output += 4; // We wrote 4 16-bit codepoints + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but + // it is easier when we can assume they are all pairs. This version does + // not use the LUT, but 4 byte sequences are less common and the overhead + // of the extra memory access is less important than the early branch + // overhead in shorter sequences. -/** - * Store the last N bytes of previous followed by 512-N bytes from input. - */ -template -__m512i prev(__m512i input, __m512i previous) { - static_assert(N<=32, "N must be no larger than 32"); - const __m512i movemask = _mm512_setr_epi32(28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11); - const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous); -#if SIMDUTF_GCC8 || SIMDUTF_GCC9 - constexpr int shift = 16-N; // workaround for GCC8,9 - return _mm512_alignr_epi8(input, rotated, shift); + // Swap byte pairs + // 10dddddd 10cccccc|10bbbbbb 11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + uint8x16_t swap = vrev16q_u8(in); + // Shift left 2 bits + // cccccc00 dddddd00 xxxxxxxx bbbbbb00 + uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2)); + // Create a magic number containing the low 2 bits of the trail surrogate + // and all the corrections needed to create the pair. 
UTF-8 4b prefix = + // -0x0000|0xF000 surrogate offset = -0x0000|0x0040 (0x10000 << 6) + // surrogate high = +0x0000|0xD800 + // surrogate low = +0xDC00|0x0000 + // ------------------------------- + // = +0xDC00|0xE7C0 + uint32x4_t magic = vmovq_n_u32(0xDC00E7C0); + // Generate unadjusted trail surrogate minus lowest 2 bits + // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 + uint32x4_t trail = + vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift); + // Insert low 2 bits of trail surrogate to magic number for later + // 11011100 00000000 11100111 110000cc + uint16x8_t magic_with_low_2 = + vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30)); + // Generate lead surrogate + // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx + uint32x4_t lead = vreinterpretq_u32_u16( + vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6)); + // Mask out lead + // 000000cc ccdddddd|xxxxxxxx xxxxxxxx + lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF))); + // Blend pairs + // 000000cc ccdddddd|11110aaa bbbbbb00 + uint16x8_t blend = vreinterpretq_u16_u32( + vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead)); + // Add magic number to finish the result + // 110111CC CCDDDDDD|110110AA BBBBBBCC + uint16x8_t composed = vaddq_u16(blend, magic_with_low_2); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = + vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); + } + uint16_t buffer[8]; + vst1q_u16(reinterpret_cast(buffer), composed); + for (int k = 0; k < 6; k++) { + utf16_output[k] = buffer[k]; + } // the loop might compiler to a couple of instructions. + // We need some validation. See + // https://github.com/simdutf/simdutf/pull/631 +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + uint8x16_t expected_mask = simdutf_make_uint8x16_t( + 0xf8, 0xc0, 0xc0, 0xc0, 0xf8, 0xc0, 0xc0, 0xc0, 0xf8, 0xc0, 0xc0, + 0xc0, 0x0, 0x0, 0x0, 0x0); #else - return _mm512_alignr_epi8(input, rotated, 16-N); -#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9 -} - -template -__m512i shuffle_epi128(__m512i v) { - static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3"); - static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3"); - static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3"); - static_assert((idx3 >= 0 && idx3 <= 3), "idx3 must be in range 0..3"); - - constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6); - return _mm512_shuffle_i32x4(v, v, shuffle); -} - -template -constexpr __m512i broadcast_epi128(__m512i v) { - return shuffle_epi128(v); -} - -/** - * Current unused. - */ -template -__m512i rotate_by_N_epi8(const __m512i input) { - - // lanes order: 1, 2, 3, 0 => 0b00_11_10_01 - const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39); - - return _mm512_alignr_epi8(permuted, input, N); -} - -/* - expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`) - stored at separate 32-bit lanes. - - For each lane we have also a character class (`char_class), given in form - 0x8080800N, where N is 4 highest bits from the leading byte; 0x80 resets - corresponding bytes during pshufb. -*/ -simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8) { - /* - Input: - - utf8: bytes stored at separate 32-bit code units - - valid: which code units have valid UTF-8 characters - - Bit layout of single word. We show 4 cases for each possible - UTF-8 character encoding. The `?` denotes bits we must not - assume their value. 
- - |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char - |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char - |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char - |????.????|????.????|????.????|0aaa.aaaa| ASCII char - byte 3 byte 2 byte 1 byte 0 - */ - - /* 1. Reset control bits of continuation bytes and the MSB - of the leading byte; this makes all bytes unsigned (and - does not alter ASCII char). - - |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char - |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char - |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char - |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char - ^^ ^^ ^^ ^ - */ - __m512i values; - const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f); - values = _mm512_and_si512(utf8, v_3f3f_3f7f); - - /* 2. Swap and join fields A-B and C-D - - |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char - |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char - |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char - |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */ - const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140); - values = _mm512_maddubs_epi16(values, v_0140_0140); - - /* 3. Swap and join fields AB & CD - - |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char - |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char - |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char - |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */ - const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000); - values = _mm512_madd_epi16(values, v_0001_1000); - - /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits - |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11 - |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10 - |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9 - |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */ - { - /** pshufb - - continuation = 0 - ascii = 7 - _2_bytes = 9 - _3_bytes = 10 - _4_bytes = 11 - - shift_left_v3 = 4 * [ - ascii, # 0000 - ascii, # 0001 - ascii, # 0010 - ascii, # 0011 - ascii, # 0100 - ascii, # 0101 - ascii, # 0110 - ascii, # 0111 - continuation, # 1000 - continuation, # 1001 - continuation, # 1010 - continuation, # 1011 - _2_bytes, # 1100 - _2_bytes, # 1101 - _3_bytes, # 1110 - _4_bytes, # 1111 - ] */ - const __m512i shift_left_v3 = _mm512_setr_epi64( - 0x0707070707070707, - 0x0b0a090900000000, - 0x0707070707070707, - 0x0b0a090900000000, - 0x0707070707070707, - 0x0b0a090900000000, - 0x0707070707070707, - 0x0b0a090900000000 - ); - - const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class); - values = _mm512_sllv_epi32(values, shift); + uint8x16_t expected_mask = {0xf8, 0xc0, 0xc0, 0xc0, 0xf8, 0xc0, + 0xc0, 0xc0, 0xf8, 0xc0, 0xc0, 0xc0, + 0x0, 0x0, 0x0, 0x0}; +#endif +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + uint8x16_t expected = simdutf_make_uint8x16_t( + 0xf0, 0x80, 0x80, 0x80, 0xf0, 0x80, 0x80, 0x80, 0xf0, 0x80, 0x80, + 0x80, 0x0, 0x0, 0x0, 0x0); +#else + uint8x16_t expected = {0xf0, 0x80, 0x80, 0x80, 0xf0, 0x80, 0x80, 0x80, + 0xf0, 0x80, 0x80, 0x80, 0x0, 0x0, 0x0, 0x0}; +#endif + uint8x16_t check = vceqq_u8(vandq_u8(in, expected_mask), expected); + bool correct = (vminvq_u32(vreinterpretq_u32_u8(check)) == 0xFFFFFFFF); + // The validation is just three instructions and it is not on a critical + // path. + if (correct) { + utf16_output += 6; // We wrote 3 32-bit surrogate pairs. + } + return 12; // We consumed 12 bytes. 
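As a cross-check on the magic-number addition above: the vector code reproduces the textbook surrogate-pair construction by folding the UTF-8 tag removal, the 0x10000 bias, and both surrogate offsets into the single constant noted in the comments. The scalar reference, a sketch rather than the patch itself:

#include <cstdint>

static inline void utf16_pair_from_codepoint(uint32_t cp, uint16_t out[2]) {
  uint32_t v = cp - 0x10000;               // cp in U+10000..U+10FFFF
  out[0] = uint16_t(0xD800 | (v >> 10));   // high (lead) surrogate
  out[1] = uint16_t(0xDC00 | (v & 0x3FF)); // low (trail) surrogate
}
// U+1F600 -> 0xD83D, 0xDE00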
} + // 3 1-4 byte sequences + uint8x16_t sh = vld1q_u8(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx])); - /* 5. Shift right the values by variable amounts to reset lowest bits - |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11 - |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16 - |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21 - |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */ - { - // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11] - const __m512i shift_right = _mm512_setr_epi64( - 0x1919191919191919, - 0x0b10151500000000, - 0x1919191919191919, - 0x0b10151500000000, - 0x1919191919191919, - 0x0b10151500000000, - 0x1919191919191919, - 0x0b10151500000000 - ); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 3 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // added to fix issue https://github.com/simdutf/simdutf/issues/514 + // We only want to write 2 * 16-bit code units when that is actually what we + // have. Unfortunately, we cannot trust the input. So it is possible to get + // 0xff as an input byte and it should not result in a surrogate pair. We + // need to check for that. + uint32_t permbuffer[4]; + vst1q_u32(permbuffer, perm); + // Mask the low and middle bytes + // 00000000 00000000 00000000 0ddddddd + uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f)); + // Because the surrogates need more work, the high surrogate is computed + // first. + uint32x4_t middlehigh = vshlq_n_u32(perm, 2); + // 00000000 00000000 00cccccc 00000000 + uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00)); + // Start assembling the sequence. Since the 4th byte is in the same position + // as it would be in a surrogate and there is no dependency, shift left + // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: + // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx + uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh); + // Top 16 bits contains the high ten bits of the surrogate pair before + // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa + // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction + uint32x4_t abc = + vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4)); + // Combine the low 6 or 7 bits by a shift right accumulate + // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct + // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o + // correction + uint32x4_t composed = vsraq_n_u32(ascii, abc, 6); + // After this is for surrogates + // Blend the low and high surrogates + // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd + uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed); + // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits + // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte: + // 11110aaa bbbbbbcc|000000cc ccdddddd + uint16x8_t masked_pair = vreinterpretq_u16_u32( + vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF)))); + // Correct the remaining UTF-8 prefix, surrogate offset, and add the + // surrogate prefixes in one magic 16-bit addition. 
similar magic number but + // without the continue byte adjust and halfword swapped UTF-8 4b prefix = + // -0xF000|0x0000 surrogate offset = -0x0040|0x0000 (0x10000 << 6) + // surrogate high = +0xD800|0x0000 + // surrogate low = +0x0000|0xDC00 + // ----------------------------------- + // = +0xE7C0|0xDC00 + uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00)); + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete + uint32x4_t surrogates = + vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic)); + // If the high bit is 1 (s32 less than zero), this needs a surrogate pair + uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm)); - const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class); - values = _mm512_srlv_epi32(values, shift); + // Select either the 4 byte surrogate pair or the 2 byte solo codepoint + // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD + uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + selected = + vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected))); } - - return values; -} - - -simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int &count) { - const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1); - const __m512i expand_ver2 = _mm512_setr_epi64( - 0x0403020103020100, - 0x0605040305040302, - 0x0807060507060504, - 0x0a09080709080706, - 0x0c0b0a090b0a0908, - 0x0e0d0c0b0d0c0b0a, - 0x000f0e0d0f0e0d0c, - 0x0201000f01000f0e - ); - const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); - const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); - const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); - const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); - const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); - count = static_cast(count_ones(leading_bytes)); - return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input); -} - -simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) { - __m512i char_class = _mm512_srli_epi32(input, 4); - /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ - const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); - const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); - char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); - return expanded_utf8_to_utf32(char_class, input); + // Attempting to shuffle and store would be complex, just scalarize. + uint32_t buffer[4]; + vst1q_u32(buffer, selected); + // Test for the top bit of the surrogate mask. Remove due to issue 514 + // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : + // 0x00800000; + for (size_t i = 0; i < 3; i++) { + // Surrogate + // Used to be if (buffer[i] & SURROGATE_MASK) { + // See discussion above. 
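The issue-514 guard applied in the store loop below, (permbuffer[i] & 0xf8000000) == 0xf0000000, simply asks whether the lane's lead byte is a genuine 4-byte lead; only then may a surrogate pair be written. In scalar terms (an illustrative sketch):

#include <cstdint>

static inline bool is_four_byte_lead(uint8_t b) {
  return (b & 0xF8) == 0xF0; // 11110xxx; 0xF0..0xF4 in well-formed UTF-8
}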
+ // patch for issue https://github.com/simdutf/simdutf/issues/514 + if ((permbuffer[i] & 0xf8000000) == 0xf0000000) { + utf16_output[0] = uint16_t(buffer[i] >> 16); + utf16_output[1] = uint16_t(buffer[i] & 0xFFFF); + utf16_output += 2; + } else { + utf16_output[0] = uint16_t(buffer[i] & 0xFFFF); + utf16_output++; + } + } + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; + } } -/* end file src/icelake/icelake_utf8_common.inl.cpp */ -/* begin file src/icelake/icelake_macros.inl.cpp */ - -/* - This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a UTF-8 string) - and loads all possible 4-byte substring into an AVX512 register. - - For example if we have bytes abcdefgh... we create following 32-bit lanes +/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */ +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_out) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint32_t *&utf32_output = reinterpret_cast(utf32_out); + uint8x16_t in = vld1q_u8(reinterpret_cast(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + if (utf8_end_of_code_point_mask == 0xfff) { + // We process in chunks of 12 bytes. + // use fast implementation in src/simdutf/arm64/simd.h + // Ideally the compiler can keep the tables in registers. + simd8 temp{vreinterpretq_s8_u8(in)}; + temp.store_ascii_as_utf32_tbl(utf32_out); + utf32_output += 12; // We wrote 12 32-bit characters. + return 12; // We consumed 12 bytes. + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte + // UTF-32 code units. Convert to UTF-16 + uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in); + // Zero extend and store via ST2 with a zero. + uint16x4x2_t interleaver = {{composed_utf16, vmov_n_u16(0)}}; + vst2_u16(reinterpret_cast(utf32_output), interleaver); + utf32_output += 4; // We wrote 4 32-bit characters. + return 12; // We consumed 12 bytes. + } - [abcd|bcde|cdef|defg|efgh|...] - ^ ^ - byte 0 of reg byte 63 of reg -*/ -/** pshufb - # lane{0,1,2} have got bytes: [ 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15] - # lane3 has got bytes: [ 16, 17, 18, 19, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15] + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if (input_utf8_end_of_code_point_mask == 0xaaa) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte + // UTF-32 code units. 
Convert to UTF-16 + uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in); + // Zero extend and store via ST2 with a zero. + uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}}; + vst2q_u16(reinterpret_cast(utf32_output), interleaver); + utf32_output += 6; // We wrote 6 32-bit characters. + return 12; // We consumed 12 bytes. + } + /// Either no fast path or an unimportant fast path. - expand_ver2 = [ - # lane 0: - 0, 1, 2, 3, - 1, 2, 3, 4, - 2, 3, 4, 5, - 3, 4, 5, 6, + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; - # lane 1: - 4, 5, 6, 7, - 5, 6, 7, 8, - 6, 7, 8, 9, - 7, 8, 9, 10, - - # lane 2: - 8, 9, 10, 11, - 9, 10, 11, 12, - 10, 11, 12, 13, - 11, 12, 13, 14, + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx); + // Zero extend and store with ST2 and zero + uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}}; + vst2q_u16(reinterpret_cast(utf32_output), interleaver); + utf32_output += 6; // We wrote 6 32-bit characters. + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + uint8x16_t sh = vld1q_u8(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // Shuffle + // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // Split + // 00000000 00000000 0ccccccc + uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits + // Note: unmasked + // xxxxxxxx aaaaxxxx xxxxxxxx + uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits + // Use 16 bit bic instead of and. + // The top bits will be corrected later in the bsl + // 00000000 10bbbbbb 00000000 + uint32x4_t middle = vreinterpretq_u32_u16( + vbicq_u16(vreinterpretq_u16_u32(perm), + vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits + // Combine low and middle with shift right accumulate + // 00000000 00xxbbbb bbcccccc + uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2); + // Insert top 4 bits from high byte with bitwise select + // 00000000 aaaabbbb bbcccccc + uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid); + vst1q_u32(utf32_output, composed); + utf32_output += 4; // We wrote 4 32-bit characters. + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-32 code units. This uses the same method as the fixed 3 byte + // version, reversing and shift left insert. However, there is no need for + // a shuffle mask now, just rev16 and rev32. + // + // This version does not use the LUT, but 4 byte sequences are less common + // and the overhead of the extra memory access is less important than the + // early branch overhead in shorter sequences, so it comes last. 
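A note on the ST2-with-zero trick used by these UTF-32 fast paths: interleaving each 16-bit result with a zero half-word is, on a little-endian target, exactly zero-extension to 32 bits, which is correct here because these paths only produce BMP code points. The scalar equivalent (a sketch, assuming little-endian as the surrounding code does):

#include <cstddef>
#include <cstdint>

static void widen_utf16_to_utf32(const uint16_t *in, size_t n, uint32_t *out) {
  for (size_t i = 0; i < n; i++)
    out[i] = in[i]; // zero-extend; vst2q_u16 with a zero vector does 8 at once
}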
- # lane 3 order: 13, 14, 15, 16 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19 - 12, 13, 14, 15, - 13, 14, 15, 0, - 14, 15, 0, 1, - 15, 0, 1, 2, - ] -*/ + // Swap pairs of bytes + // 10dddddd|10cccccc|10bbbbbb|11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in)); + // Shift left and insert + // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb + uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6); + // Swap 16-bit lanes + // xxxxcccc ccdddddd xxxxxxxa aabbbbbb + // xxxxxxxa aabbbbbb xxxxcccc ccdddddd + uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1)); + // Shift insert again + // xxxxxxxx xxxaaabb bbbbcccc ccdddddd + uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12); + // Clear the garbage + // 00000000 000aaabb bbbbcccc ccdddddd + uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF)); + // Store + vst1q_u32(utf32_output, composed); -#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED) \ - { \ - const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1); \ - const __m512i expand_ver2 = _mm512_setr_epi64( \ - 0x0403020103020100, \ - 0x0605040305040302, \ - 0x0807060507060504, \ - 0x0a09080709080706, \ - 0x0c0b0a090b0a0908, \ - 0x0e0d0c0b0d0c0b0a, \ - 0x000f0e0d0f0e0d0c, \ - 0x0201000f01000f0e \ - ); \ - const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); \ - \ - __mmask16 leading_bytes; \ - const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); \ - const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); \ - const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); \ - leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); \ - \ - __m512i char_class; \ - char_class = _mm512_srli_epi32(input, 4); \ - /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ \ - const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); \ - const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); \ - char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \ - \ - const int valid_count = static_cast(count_ones(leading_bytes)); \ - const __m512i utf32 = expanded_utf8_to_utf32(char_class, input); \ - \ - const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, utf32); \ - \ - if (UTF32) { \ - if(MASKED) { \ - const __mmask16 valid = uint16_t((1 << valid_count) - 1); \ - _mm512_mask_storeu_epi32((__m512i*)output, valid, out); \ - } else { \ - _mm512_storeu_si512((__m512i*)output, out); \ - } \ - output += valid_count; \ - } else { \ - if(MASKED) { \ - output += utf32_to_utf16_masked(byteflip, out, valid_count, reinterpret_cast(output)); \ - } else { \ - output += utf32_to_utf16(byteflip, out, valid_count, reinterpret_cast(output)); \ - } \ - } \ - } + utf32_output += 3; // We wrote 3 32-bit characters. + return 12; // We consumed 12 bytes. + } + // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit + // due to surrogates no longer being involved. 
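The rev16/rev32 plus shift-left-insert sequence above reconstructs the standard 21-bit layout of a 4-byte sequence. Written out in plain C++ for one code point, as a reference sketch:

#include <cstdint>

static inline uint32_t utf32_from_4bytes(const uint8_t b[4]) {
  return (uint32_t(b[0] & 0x07) << 18) |  // 11110aaa
         (uint32_t(b[1] & 0x3F) << 12) |  // 10bbbbbb
         (uint32_t(b[2] & 0x3F) << 6)  |  // 10cccccc
         (uint32_t(b[3] & 0x3F));         // 10dddddd
}
// {0xF0, 0x9F, 0x98, 0x80} -> 0x1F600 (U+1F600)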
+ uint8x16_t sh = vld1q_u8(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 2 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // Ascii + uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); + uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00)); + // When converting the way we do, the 3 byte prefix will be interpreted as + // the 18th bit being set, since the code would interpret the lead byte + // (0b1110bbbb) as a continuation byte (0b10bbbbbb). To fix this, we can + // either xor or do an 8 bit add of the 6th bit shifted right by 1. Since + // NEON has shift right accumulate, we use that. + // 4 byte 3 byte + // 10bbbbbb 1110bbbb + // 00000000 01000000 6th bit + // 00000000 00100000 shift right + // 10bbbbbb 0000bbbb add + // 00bbbbbb 0000bbbb mask + uint8x16_t correction = + vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000))); + uint32x4_t corrected = vreinterpretq_u32_u8( + vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1)); + // 00000000 00000000 0000cccc ccdddddd + uint32x4_t cd = vsraq_n_u32(ascii, middle, 2); + // Insert twice + // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx + uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6), + vshrq_n_u32(corrected, 4)); + // 00000000 000aaabb bbbbcccc ccdddddd + uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab); + // Store + vst1q_u32(utf32_output, composed); + utf32_output += 3; // We wrote 3 32-bit characters. + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; + } +} +/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED) \ -{ \ - if (UTF32) { \ - if(MASKED) { \ - const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1); \ - _mm512_mask_storeu_epi32((__m512i*)output, valid_mask, INPUT); \ - } else { \ - _mm512_storeu_si512((__m512i*)output, INPUT); \ - } \ - output += VALID_COUNT; \ - } else { \ - if(MASKED) { \ - output += utf32_to_utf16_masked(byteflip, INPUT, VALID_COUNT, reinterpret_cast(output)); \ - } else { \ - output += utf32_to_utf16(byteflip, INPUT, VALID_COUNT, reinterpret_cast(output)); \ - } \ - } \ -} - - -#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) \ - if (UTF32) { \ - const __m128i t0 = _mm512_castsi512_si128(utf8); \ - const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1); \ - const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2); \ - const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3); \ - _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_cvtepu8_epi32(t0)); \ - _mm512_storeu_si512((__m512i*)(output + 1*16), _mm512_cvtepu8_epi32(t1)); \ - _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_cvtepu8_epi32(t2)); \ - _mm512_storeu_si512((__m512i*)(output + 3*16), _mm512_cvtepu8_epi32(t3)); \ - } else { \ - const __m256i h0 = _mm512_castsi512_si256(utf8); \ - const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1); \ - if(big_endian) { \ - _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \ - _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \ - } else { \ - _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_cvtepu8_epi16(h0)); 
\ - _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_cvtepu8_epi16(h1)); \ - } \ - } -/* end file src/icelake/icelake_macros.inl.cpp */ -/* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */ -// file included directly +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/arm64/arm_convert_utf16_to_latin1.cpp */ -// File contains conversion procedure from VALID UTF-8 strings. +template +std::pair +arm_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + while (end - buf >= 8) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if simdutf_constexpr (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + } + if (vmaxvq_u16(in) <= 0xff) { + // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} +template +std::pair +arm_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + while (end - buf >= 8) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if simdutf_constexpr (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + } + if (vmaxvq_u16(in) <= 0xff) { + // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + // Let us do a scalar fallback. + for (int k = 0; k < 8; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} +/* end file src/arm64/arm_convert_utf16_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */ /* - valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32. + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. - The `OUTPUT` template type decides what to do with UTF-32: store - it directly or convert into UTF-16 (with AVX512). + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. - Input: - - str - valid UTF-8 string - - len - string length - - out_buffer - output buffer + Ad 1. 
- Result: - - pair.first - the first unprocessed input byte - - pair.second - the first unprocessed output word -*/ -template -std::pair valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) { - constexpr bool UTF32 = std::is_same::value; - constexpr bool UTF16 = std::is_same::value; - static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); - static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32"); + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - const char* ptr = str; - const char* end = ptr + len; - - OUTPUT* output = dwords; - /** - * In the main loop, we consume 64 bytes per iteration, - * but we access 64 + 4 bytes. - * We check for ptr + 64 + 64 <= end because - * we want to be do maskless writes without overruns. - */ - while (ptr + 64 + 64 <= end) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); - const __m512i v_80 = _mm512_set1_epi8(char(0x80)); - const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80); - if(ascii == 0) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; - continue; - } + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if(valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); - int valid_count2; - __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); - uint32_t tmp1; - ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); - const __m512i lane4 = _mm512_set1_epi32(tmp1); - int valid_count3; - __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); - if(valid_count2 + valid_count3 <= 16) { - vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if(valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); - SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. - ptr += 3*16; - } - } - return {ptr, output}; -} + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. 
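The three cases this comment enumerates can be restated as a scalar classifier over one UTF-16 code unit; the vector code evaluates the same two comparisons across eight lanes at once. A sketch with illustrative names:

#include <cstdint>

enum class U16Class { OneOrTwoUtf8Bytes, UpToThreeUtf8Bytes, SurrogateHalf };

static inline U16Class classify(uint16_t w) {
  if ((w & 0xF800) == 0xD800) return U16Class::SurrogateHalf;     // 0xD800..0xDFFF
  if (w < 0x0800)             return U16Class::OneOrTwoUtf8Bytes; // case 1 above
  return U16Class::UpToThreeUtf8Bytes;                            // rest of the BMP
}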
-using utf8_to_utf16_result = std::pair; -/* end file src/icelake/icelake_from_valid_utf8.inl.cpp */ -/* begin file src/icelake/icelake_utf8_validation.inl.cpp */ -// file included directly + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. -simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1) { - __m512i mask1 = _mm512_setr_epi64( - 0x0202020202020202, - 0x4915012180808080, - 0x0202020202020202, - 0x4915012180808080, - 0x0202020202020202, - 0x4915012180808080, - 0x0202020202020202, - 0x4915012180808080); - const __m512i v_0f = _mm512_set1_epi8(0x0f); - __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f); - - __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1); - __m512i mask2 = _mm512_setr_epi64( - 0xcbcbcb8b8383a3e7, - 0xcbcbdbcbcbcbcbcb, - 0xcbcbcb8b8383a3e7, - 0xcbcbdbcbcbcbcbcb, - 0xcbcbcb8b8383a3e7, - 0xcbcbdbcbcbcbcbcb, - 0xcbcbcb8b8383a3e7, - 0xcbcbdbcbcbcbcbcb); - __m512i index2 = _mm512_and_si512(prev1, v_0f); - - __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2); - __m512i mask3 = _mm512_setr_epi64( - 0x101010101010101, - 0x1010101babaaee6, - 0x101010101010101, - 0x1010101babaaee6, - 0x101010101010101, - 0x1010101babaaee6, - 0x101010101010101, - 0x1010101babaaee6 - ); - __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f); - __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3); - return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128); - } - - simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input, - const __m512i prev_input, const __m512i sc) { - __m512i prev2 = prev<2>(input, prev_input); - __m512i prev3 = prev<3>(input, prev_input); - __m512i is_third_byte = _mm512_subs_epu8(prev2, _mm512_set1_epi8(0b11100000u-1)); // Only 111_____ will be > 0 - __m512i is_fourth_byte = _mm512_subs_epu8(prev3, _mm512_set1_epi8(0b11110000u-1)); // Only 1111____ will be > 0 - __m512i is_third_or_fourth_byte = _mm512_or_si512(is_third_byte, is_fourth_byte); - const __m512i v_7f = _mm512_set1_epi8(char(0x7f)); - is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte); - // We want to compute (is_third_or_fourth_byte AND v80) XOR sc. - const __m512i v_80 = _mm512_set1_epi8(char(0x80)); - return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc, 0b1101010); - //__m512i is_third_or_fourth_byte_mask = _mm512_and_si512(is_third_or_fourth_byte, v_80); - //return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc); - } - // - // Return nonzero if there are incomplete multibyte characters at the end of the block: - // e.g. if there is a 4-byte character, but it's 3 bytes from the end. - // - simdutf_really_inline __m512i is_incomplete(const __m512i input) { - // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): - // ... 1111____ 111_____ 11______ - __m512i max_value = _mm512_setr_epi64( - 0xffffffffffffffff, - 0xffffffffffffffff, - 0xffffffffffffffff, - 0xffffffffffffffff, - 0xffffffffffffffff, - 0xffffffffffffffff, - 0xffffffffffffffff, - 0xbfdfefffffffffff); - return _mm512_subs_epu8(input, max_value); - } - - struct avx512_utf8_checker { - // If this is nonzero, there has been a UTF-8 error. 
- __m512i error{}; - - // The last input we received - __m512i prev_input_block{}; - // Whether the last input we received was incomplete (used for ASCII fast path) - __m512i prev_incomplete{}; + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair +arm_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + const char16_t *end = buf + len; - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - __m512i prev1 = prev<1>(input, prev_input); - __m512i sc = check_special_cases(input, prev1); - this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error); - } - - // The only problem that can happen at EOF is that a multibyte character is too short - // or a byte value too large in the last bytes: check_special_cases only checks for bytes - // too large in the first of two bytes. - simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error = _mm512_or_si512(this->error, this->prev_incomplete); - } + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - // returns true if ASCII. - simdutf_really_inline bool check_next_input(const __m512i input) { - const __m512i v_80 = _mm512_set1_epi8(char(0x80)); - const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80); - if(ascii == 0) { - this->error = _mm512_or_si512(this->error, this->prev_incomplete); - return true; - } else { - this->check_utf8_bytes(input, this->prev_input_block); - this->prev_incomplete = is_incomplete(input); - this->prev_input_block = input; - return false; - } - } - // do not forget to call check_eof! - simdutf_really_inline bool errors() const { - return _mm512_test_epi8_mask(this->error, this->error) != 0; + while (end - buf >= 8) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if simdutf_constexpr (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); } - }; // struct avx512_utf8_checker -/* end file src/icelake/icelake_utf8_validation.inl.cpp */ -/* begin file src/icelake/icelake_from_utf8.inl.cpp */ -// file included directly - -// File contains conversion procedure from possibly invalid UTF-8 strings. - -/** - * Attempts to convert up to len 1-byte code units from in (in UTF-8 format) to - * out. - * Returns the position of the input and output after the processing is - * completed. Upon error, the output is set to null. 
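For readers comparing this removed AVX-512 checker against the NEON validation used elsewhere in the new file, its statefulness reduces to a small pattern. This sketch keeps only the control flow; the real struct also carries the previous 64-byte block so cross-block lookback works:

struct Utf8CheckerShape {
  bool have_error = false;
  bool prev_incomplete = false; // previous block ended mid-code-point

  void next_block(bool is_ascii, bool block_valid, bool ends_incomplete) {
    if (is_ascii) { have_error |= prev_incomplete; return; } // ASCII can't finish a char
    have_error |= !block_valid;
    prev_incomplete = ends_incomplete;
  }
  void eof() { have_error |= prev_incomplete; } // truncated trailing character
};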
- */ - -template -utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) { - const char *const final_in = in + len; - bool result = true; - while (result) { - if (in + 64 <= final_in) { - result = process_block_utf8_to_utf16(in, out, final_in - in); - } else if(in < final_in) { - result = process_block_utf8_to_utf16(in, out, final_in - in); - } else { break; } - } - if(!result) { out = nullptr; } - return std::make_pair(in, out); + const uint16x8_t surrogates_bytemask = + vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code + // units + vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); + vst1q_u32(utf32_output + 4, vmovl_high_u16(in)); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(buf, reinterpret_cast(utf32_output)); } +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ template -simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, size_t len, char16_t *out) { - const char *const init_in = in; - const char16_t *const init_out = out; - const char *const final_in = in + len; - bool result = true; - while (result) { - if (in + 64 <= final_in) { - result = process_block_utf8_to_utf16(in, out, final_in - in); - } else if(in < final_in) { - result = process_block_utf8_to_utf16(in, out, final_in - in); - } else { break; } - } - if(!result) { - // rewind_and_convert_with_errors will seek a potential error from in onward, - // with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward. 
- simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in - init_in, in, final_in - in, out); - res.count += (in - init_in); - return res; - } else { - return simdutf::result(error_code::SUCCESS,out - init_out); - } -} - - -template -std::pair validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) { - constexpr bool UTF32 = std::is_same::value; - constexpr bool UTF16 = std::is_same::value; - static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); - static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32"); - - const char* ptr = str; - const char* end = ptr + len; - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - OUTPUT* output = dwords; - avx512_utf8_checker checker{}; - /** - * In the main loop, we consume 64 bytes per iteration, - * but we access 64 + 4 bytes. - * We check for ptr + 64 + 64 <= end because - * we want to be do maskless writes without overruns. - */ - while (ptr + 64 + 64 <= end) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); - if(checker.check_next_input(utf8)) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; - continue; - } - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if(valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); - int valid_count2; - __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); - uint32_t tmp1; - ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); - const __m512i lane4 = _mm512_set1_epi32(tmp1); - int valid_count3; - __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); - if(valid_count2 + valid_count3 <= 16) { - vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if(valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1< +arm_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + const char16_t *start = buf; + const char16_t *end = buf + len; - const __m512i lane3 = broadcast_epi128<3>(utf8); - SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - ptr += 3*16; - } - validatedptr += 4*16; - } - { - const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr); - checker.check_next_input(utf8); - } - checker.check_eof(); - if(checker.errors()) { - return {ptr, nullptr}; // We found an error. 
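The scalar fallback in the with_errors loop below leans on a compact branch-free range check: after subtracting the surrogate bases, both halves land in 0..0x3FF only when the pair is well formed, so a single comparison covers both conditions. Restated for reference (a sketch):

#include <cstdint>

static inline bool is_valid_surrogate_pair(uint16_t hi, uint16_t lo) {
  uint16_t d1 = uint16_t(hi - 0xD800); // 0..0x3FF iff hi is a lead surrogate
  uint16_t d2 = uint16_t(lo - 0xDC00); // 0..0x3FF iff lo is a trail surrogate
  return (d1 | d2) <= 0x3FF;
}
// When valid, the code point is 0x10000 + (d1 << 10) + d2.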
+ while ((end - buf) >= 8) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if simdutf_constexpr (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); } - return {ptr, output}; -} -// Like validating_utf8_to_fixed_length but returns as soon as an error is identified -template -std::tuple validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) { - constexpr bool UTF32 = std::is_same::value; - constexpr bool UTF16 = std::is_same::value; - static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); - static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32"); - - const char* ptr = str; - const char* end = ptr + len; - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - OUTPUT* output = dwords; - avx512_utf8_checker checker{}; - /** - * In the main loop, we consume 64 bytes per iteration, - * but we access 64 + 4 bytes. - * We check for ptr + 64 + 64 <= end because - * we want to be do maskless writes without overruns. - */ - while (ptr + 64 + 64 <= end) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); - if(checker.check_next_input(utf8)) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; - continue; - } - if(checker.errors()) { - return {ptr, output, false}; // We found an error. - } - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if(valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); - int valid_count2; - __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); - uint32_t tmp1; - ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); - const __m512i lane4 = _mm512_set1_epi32(tmp1); - int valid_count3; - __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); - if(valid_count2 + valid_count3 <= 16) { - vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<(buf[k]); + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); } else { - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if(valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); - SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) - - ptr += 3*16; + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); } - validatedptr += 4*16; - } - { - const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr); - 
checker.check_next_input(utf8); - } - checker.check_eof(); - if(checker.errors()) { - return {ptr, output, false}; // We found an error. + } + buf += k; } - return {ptr, output, true}; + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf32_output)); } -/* end file src/icelake/icelake_from_utf8.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */ -// file included directly - -// File contains conversion procedure from possibly invalid UTF-8 strings. +/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8 +/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. -// template -template -simdutf_really_inline size_t process_block_from_utf8_to_latin1(const char *buf, size_t len, - char *latin_output, __m512i minus64, - __m512i one, - __mmask64 *next_leading_ptr, - __mmask64 *next_bit6_ptr) { - __mmask64 load_mask = - is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; - __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); - __mmask64 nonascii = _mm512_movepi8_mask(input); + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. - if (nonascii == 0) { - is_remaining - ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input) - : _mm512_storeu_si512((__m512i *)latin_output, input); - return len; - } + Ad 1. - __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. - __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); - __mmask64 invalid_leading_bytes = - _mm512_mask_cmpgt_epu8_mask(leading, highbits, one); + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. - if (invalid_leading_bytes) { - return 0; // Indicates error - } + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. - __mmask64 leading_shift = (leading << 1) | *next_leading_ptr; - *next_leading_ptr = leading >> 63; + Ad 2. - if ((nonascii ^ leading) != leading_shift) { - return 0; // Indicates error - } + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. - __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); - input = - _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); - *next_bit6_ptr = bit6 >> 63; + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. - __mmask64 retain = ~leading & load_mask; - __m512i output = _mm512_maskz_compress_epi8(retain, input); - int64_t written_out = count_ones(retain); - __mmask64 store_mask = (1ULL << written_out) - 1; - - // *************************** - // Possible optimization? 
(Nick Nuon) - // This commented out line is 5% faster but sadly it'll also write past - // memory bounds for latin1_output: is_remaining ? - // _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output) : - // _mm512_storeu_si512((__m512i *)latin_output, output); I tried using - // _mm512_storeu_si512 and have the next process_block start from the - // "written_out" point but the compiler shuffles memory in such a way that it - // is significantly slower... - // **************************** - _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output); + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. - return written_out; -} + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. -size_t utf8_to_latin1_avx512(const char *buf, size_t len, char *latin_output) { - char *start = latin_output; - size_t pos = 0; - __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000 - __m512i one = _mm512_set1_epi8(1); - __mmask64 next_leading = 0; - __mmask64 next_bit6 = 0; - while (pos + 64 <= len) { - size_t written = process_block_from_utf8_to_latin1(buf + pos, 64, latin_output, minus64, - one, &next_leading, &next_bit6); - if (written == 0) { - return 0; // Indicates error - } - latin_output += written; - pos += 64; - } + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair +arm_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char16_t *end = buf + len; - if (pos < len) { - size_t remaining = len - pos; - size_t written = - process_block_from_utf8_to_latin1(buf + pos, remaining, latin_output, minus64, one, - &next_leading, &next_bit6); - if (written == 0) { - return 0; // Indicates error + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if simdutf_constexpr (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + } + if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII + // characters. + uint16x8_t nextin = + vld1q_u16(reinterpret_cast(buf) + 8); + if simdutf_constexpr (!match_system(big_endian)) { + nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin))); + } + if (vmaxvq_u16(nextin) > 0x7F) { + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); + // 2. store (16 bytes) + vst1q_u8(utf8_output, utf8_packed); + // 3. 
adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } } - latin_output += written; - } - - return (size_t)(latin_output - start); -} -/* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */ -/* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */ -// file included directly -// File contains conversion procedure from valid UTF-8 strings. + if (vmaxvq_u16(in) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); -template -simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1(const char *buf, size_t len, - char *latin_output, - __m512i minus64, __m512i one, - __mmask64 *next_leading_ptr, - __mmask64 *next_bit6_ptr) { - __mmask64 load_mask = - is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; - __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); - __mmask64 nonascii = _mm512_movepi8_mask(input); + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); + const uint8x16_t utf8_unpacked = + vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); - if (nonascii == 0) { - is_remaining - ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input) - : _mm512_storeu_si512((__m512i *)latin_output, input); - return len; - } + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); - __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } + const uint16x8_t surrogates_bytemask = + vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = simdutf_make_uint16x8_t( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes - __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - *next_leading_ptr = leading >> 63; + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); - input = - _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); - *next_bit6_ptr = bit6 >> 63; + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. - __mmask64 retain = ~leading & load_mask; - __m512i output = _mm512_maskz_compress_epi8(retain, input); - int64_t written_out = count_ones(retain); - __mmask64 store_mask = (1ULL << written_out) - 1; - // Optimization opportunity: sometimes, masked writes are not needed. - _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output); - return written_out; -} + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8( + vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); + const uint16x8_t m0 = + vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef simdutf_vec -size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len, - char *latin_output) { - char *start = latin_output; - size_t pos = 0; - __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000 - __m512i one = _mm512_set1_epi8(1); - __mmask64 next_leading = 0; - __mmask64 next_bit6 = 0; + // 4. expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); - while (pos + 64 <= len) { - size_t written = process_valid_block_from_utf8_to_latin1( - buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6); - latin_output += written; - pos += 64; - } + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); + const uint16x8_t twomask = simdutf_make_uint16x8_t( + 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); +#else + const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0100, 0x0400, 0x1000, 0x4000}; + const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, + 0x0200, 0x0800, 0x2000, 0x8000}; +#endif + const uint16x8_t combined = + vorrq_u16(vandq_u16(one_byte_bytemask, onemask), + vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); - if (pos < len) { - size_t remaining = len - pos; - size_t written = - process_valid_block_from_utf8_to_latin1(buf + pos, remaining, latin_output, minus64, - one, &next_leading, &next_bit6); - latin_output += written; - } + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); - return (size_t)(latin_output - start); -} -/* end file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */ -// file included directly -template -size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len, - char *latin1_output) { - const char16_t *end = buf + len; - __m512i v_0xFF = _mm512_set1_epi16(0xff); - __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - __m512i shufmask = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, - 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); - while (buf + 32 <= end) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); - } - if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { - return 0; - } - _mm256_storeu_si256( - (__m256i *)latin1_output, - _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); - latin1_output += 32; - buf += 32; - } - if (buf < end) { - uint32_t mask(uint32_t(1 << (end - buf)) - 1); - __m512i in = _mm512_maskz_loadu_epi16(mask, buf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); - } - if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { - return 0; + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + 
vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; } - _mm256_mask_storeu_epi8( - latin1_output, mask, - _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); - } - return len; + } // while + + return std::make_pair(buf, reinterpret_cast(utf8_output)); } +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ template std::pair -icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, - char *latin1_output) { - const char16_t *end = buf + len; +arm_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); const char16_t *start = buf; - __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - __m512i v_0xFF = _mm512_set1_epi16(0xff); - __m512i shufmask = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, - 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); - while (buf + 32 <= end) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); - } - if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { - uint16_t word; - while ((word = (big_endian ? 
scalar::utf16::swap_bytes(uint16_t(*buf)) - : uint16_t(*buf))) <= 0xff) { - *latin1_output++ = uint8_t(word); - buf++; - } - return std::make_pair(result(error_code::TOO_LARGE, buf - start), - latin1_output); - } - _mm256_storeu_si256( - (__m256i *)latin1_output, - _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); - latin1_output += 32; - buf += 32; - } - if (buf < end) { - uint32_t mask(uint32_t(1 << (end - buf)) - 1); - __m512i in = _mm512_maskz_loadu_epi16(mask, buf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); - } - if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + const char16_t *end = buf + len; - uint16_t word; - while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf)) - : uint16_t(*buf))) <= 0xff) { - *latin1_output++ = uint8_t(word); - buf++; + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if simdutf_constexpr (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + } + if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII + // characters. + uint16x8_t nextin = + vld1q_u16(reinterpret_cast(buf) + 8); + if simdutf_constexpr (!match_system(big_endian)) { + nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin))); + } + if (vmaxvq_u16(nextin) > 0x7F) { + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); + // 2. store (16 bytes) + vst1q_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! } - return std::make_pair(result(error_code::TOO_LARGE, buf - start), - latin1_output); } - _mm256_mask_storeu_epi8( - latin1_output, mask, - _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); - } - return std::make_pair(result(error_code::SUCCESS, len), latin1_output); -} -/* end file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ -// file included directly -/** - * This function converts the input (inbuf, inlen), assumed to be valid - * UTF16 (little endian) into UTF-8 (to outbuf). The number of code units written - * is written to 'outlen' and the function reports the number of input word - * consumed. 
- */ -template -size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen, - unsigned char *outbuf, size_t *outlen) { - __m512i in; - __mmask32 inmask = _cvtu32_mask32(0x7fffffff); - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - const char16_t * const inbuf_orig = inbuf; - const unsigned char * const outbuf_orig = outbuf; - size_t adjust = 0; - int carry = 0; - - while (inlen >= 32) { - in = _mm512_loadu_si512(inbuf); - if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } - inlen -= 31; - lastiteration: - inbuf += 31; + if (vmaxvq_u16(in) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - failiteration: - const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask( - inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT); + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); + const uint8x16_t utf8_unpacked = + vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); - if (_ktestz_mask32_u8(inmask, is234byte)) { - // fast path for ASCII only - _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in); - outbuf += 31; - carry = 0; + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); - if (inlen < 32) { - goto tail; - } else { - continue; - } + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; } + const uint16x8_t surrogates_bytemask = + vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = simdutf_make_uint16x8_t( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
- const __mmask32 is12byte =
- _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
-
- if (_ktestc_mask32_u8(is12byte, inmask)) {
- // fast path for 1 and 2 byte only
-
- const __m512i twobytes = _mm512_ternarylogic_epi32(
- _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
- _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
- in = _mm512_mask_add_epi16(in, is234byte, twobytes,
- _mm512_set1_epi16(int16_t(0x80c0)));
- const __m512i cmpmask =
- _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
- _mm512_set1_epi16(0x0800));
- const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
- const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
- _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))),
- out);
- outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
- carry = 0;
-
- if (inlen < 32) {
- goto tail;
- } else {
- continue;
- }
- }
- __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
- __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
- __m512i taglo = _mm512_set1_epi32(0x8080e000);
- __m512i taghi = taglo;
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
- const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
- const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
- inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
- const __mmask32 losurr = _mm512_cmp_epu16_mask(
- fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
+ Finally from these two code units we build a proper UTF-8 sequence, taking
+ into account the case (i.e., the number of bytes to write).
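+
+ Worked example (editorial addition): U+00E9 is stored as
+ [0000|0000|1110|1001]; case #2 applies with bbbbb = 00011 and
+ cccccc = 101001, so the two output bytes are [1100|0011] and
+ [1010|1001], i.e. 0xC3 0xA9 -- the UTF-8 encoding of 'é'.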
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8( + vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); + const uint16x8_t m0 = + vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef simdutf_vec - int carryout = 0; - if (!_kortestz_mask32_u8(hisurr, losurr)) { - // handle surrogates + // 4. expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); - __m512i los = _mm512_alignr_epi32(hi, lo, 1); - __m512i his = _mm512_alignr_epi32(lo, hi, 1); + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); + const uint16x8_t twomask = simdutf_make_uint16x8_t( + 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); +#else + const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0100, 0x0400, 0x1000, 0x4000}; + const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, + 0x0200, 0x0800, 0x2000, 0x8000}; +#endif + const uint16x8_t combined = + vorrq_u16(vandq_u16(one_byte_bytemask, onemask), + vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); - const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16); - taglo = - _mm512_mask_mov_epi32(taglo,__mmask16(hisurr), _mm512_set1_epi32(0x808080f0)); - taghi = - _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0)); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); - lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10); - hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10); - los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400)); - his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400)); - lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los); - hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his); + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); - carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30)); + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; - const uint32_t h = _cvtmask32_u32(hisurr); - const uint32_t l = _cvtmask32_u32(losurr); - // check for mismatched surrogates - if ((h + h + carry) ^ l) { - const uint32_t lonohi = l & ~(h + h + carry); - const uint32_t hinolo = h & ~(l >> 1); - inlen = _tzcnt_u32(hinolo | lonohi); - inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1)); - in = _mm512_maskz_mov_epi16(inmask, in); - adjust = (int)inlen - 31; - inlen = 0; - goto failiteration; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
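+ // [editorial note] The fallback processes at most `forward` (15, or fewer
+ // near the end of the input) code units before returning to the SIMD path,
+ // so the buf[k + 1] lookahead used for surrogate pairs stays inside the
+ // region covered by the loop's bounds check. Unlike the plain
+ // arm_convert_utf16_to_utf8 above, a mismatched pair here is reported as
+ // result(error_code::SURROGATE, position) rather than a nullptr sentinel.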
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + reinterpret_cast(utf8_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } } + buf += k; } + } // while - hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff),hi); - carry = carryout; - - __m512i mslo = - _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo); - - __m512i mshi = - _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi); - - const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask)); - const __mmask64 outmhi = _kshiftri_mask64(outmask, 16); - - const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte)); - const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16); - const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16); - - taglo = - _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000)); - taghi = - _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000)); - __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); - __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); +} +template +simdutf_really_inline size_t +arm64_utf8_length_from_utf16_bytemask(const char16_t *in, size_t size) { + constexpr size_t N = + 16; // we process 16 char16_t at a time, this is NEON specific + + if (N + 1 > size) { + return scalar::utf16::utf8_length_from_utf16(in, size); + } // special case for short inputs + size_t count = 0; + const auto one = vmovq_n_u8(1); + + // The general strategy is as follows: + // 1. each code unit yields at least one byte, we can account for that by + // adding the size of the input to the count. + // 2. ASCII bytes then count for zero. + // 3. Values that yield 2 or 3 bytes in UTF-8 add 1 or 2 to the count. + // 4. Surrogate pairs are handled by adding 1 for each surrogate code unit + // for a total of 4 bytes for the pair. + size_t pos = 0; + // We will go through the input at least once. 
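+ // [editorial sketch] Per code unit, the vector arithmetic below computes
+ // the same increment as this scalar rule (hypothetical helper, shown only
+ // for illustration):
+ //
+ //   size_t extra_bytes(uint16_t w) {
+ //     if (w < 0x0080) { return 0; }             // 1-byte UTF-8
+ //     if (w < 0x0800) { return 1; }             // 2-byte UTF-8
+ //     if ((w & 0xF800) == 0xD800) { return 1; } // 2 per surrogate unit, 4 per pair
+ //     return 2;                                 // 3-byte UTF-8
+ //   }
+ //
+ // c0 implements the first test, c1 the second, and is_surrogate (whose
+ // 0xFF lanes behave as -1 in the 8-bit lane additions) backs out one byte
+ // per surrogate code unit before the widening sum.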
+ for (; size - pos >= N; pos += N) { + auto base_input = vld2q_u8(reinterpret_cast(in + pos)); + // + size_t idx = 1; // we use the second lane of the deinterleaved load + if simdutf_constexpr (!match_system(big_endian)) { + idx = 0; + } + size_t idx_lsb = idx ^ 1; + auto c0 = + vminq_u8(vorrq_u8(vandq_u8(base_input.val[idx_lsb], vdupq_n_u8(0x80)), + base_input.val[idx]), + one); + auto c1 = vminq_u8(vandq_u8(base_input.val[idx], vdupq_n_u8(0xf8)), one); + auto is_surrogate = vcleq_u8( + vsubq_u8(base_input.val[idx], vdupq_n_u8(0xd8)), vdupq_n_u8(7)); + + auto v_count = vaddq_u8(c1, c0); + v_count = vaddq_u8(v_count, is_surrogate); + count += vaddlvq_u8(v_count); // sum the counts in the vector + ///////// + // The vaddlvq_u8 instruction could be slow on some hardware. We could + // consider various alternatives if that is an issue such as accumulating + // into a vector of uint16_t or uint8_t and summing only at the end or + // periodically. However, on fast chipsets, like Apple Silicon, it is + // likely fast enough, or even faster than alternatives. + ///////// + } + count += pos; + // If we end with a high surrogate, it might be unpaired or not, we + // don't know. It counts as a pair suggarate for now. + + if (scalar::utf16::is_high_surrogate(in[pos - 1])) { + if (pos == size) { + count += 2; + } else if (scalar::utf16::is_low_surrogate(in[pos])) { + pos += 1; + count += 2; + } + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} - magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); - magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); +template +simdutf_really_inline result +arm64_utf8_length_from_utf16_with_replacement(const char16_t *in, size_t size) { + constexpr size_t N = + 16; // we process 16 char16_t at a time, this is NEON specific + + if (N + 1 > size) { + return scalar::utf16::utf8_length_from_utf16_with_replacement( + in, size); + } // special case for short input + size_t count = 0; + bool any_surrogates = false; + const auto one = vmovq_n_u8(1); + + // The general strategy is as follows: + // 1. each code unit yields at least one byte, we can account for that by + // adding the size of the input to the count. + // 2. ASCII bytes then count for zero. + // 3. Values that yield 2 or 3 bytes in UTF-8 add 1 or 2 to the count. + // 4. Surrogate pairs are handled by adding 1 for each surrogate code unit + // for a total of 4 bytes for the pair. + // 5. Unpaired surrogate elements have value 0xfffd in UTF-8, which is 3 + // bytes, + // so we need to add 2 more bytes for each unpaired surrogate. In effect, + // an unpaired surrogate should count for 1 (+1 for the ) + // + // Our strategy is to proceed like the arm64_utf8_length_from_utf16_bytemask + // function, but, at the same time, to record the number of unpaired + // surrogates. and then adjust the count accordingly. - mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo, - 0xea); // A&B|C - mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi, - 0xea); - mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24); + // If we start with a low surrogate, it is unpaired and the SIMD code won't + // detect it, so we handle that here. 
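+ // [editorial note] Byte accounting: the counting pass below already credits
+ // every surrogate code unit with 2 bytes (1 base + 1 extra), while the
+ // replacement character U+FFFD occupies 3 UTF-8 bytes, so each unpaired
+ // surrogate recorded here contributes exactly one further byte when
+ // number_of_unpaired_surrogates is added to `count` at the end.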
+ size_t number_of_unpaired_surrogates = 0; + if (scalar::utf16::is_low_surrogate(in[0])) { + number_of_unpaired_surrogates += 1; + any_surrogates = true; + } + size_t pos = 0; + // We will go through the input at least once. + for (; size - pos >= N + 1; pos += N) { + auto base_input = vld2q_u8(reinterpret_cast(in + pos)); + size_t idx = 1; // we use the second lane of the deinterleaved load + if simdutf_constexpr (!match_system(big_endian)) { + idx = 0; + } + size_t idx_lsb = idx ^ 1; + auto is_surrogate = vcleq_u8( + vsubq_u8(base_input.val[idx], vdupq_n_u8(0xd8)), vdupq_n_u8(7)); + // We count on the fact that most inputs do not have surrogates. + if (vmaxvq_u32(vreinterpretq_u32_u8(is_surrogate)) || + scalar::utf16::is_low_surrogate(in[pos + N])) { + any_surrogates = true; + // there is at least one surrogate in the block + // We use this to check that surrogates are paired correctly. + // It is the input shifted by one code unit (two bytes). + // We use it to detect *low* surrogates. + auto one_unit_offset_input = + vld2q_u8(reinterpret_cast(in + pos + 1)); + // - mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24); + auto lb_masked = vandq_u8(base_input.val[idx], vdupq_n_u8(0xfc)); + auto block_masked = + vandq_u8(one_unit_offset_input.val[idx], vdupq_n_u8(0xfc)); + auto lb_is_high = vceqq_u8(lb_masked, vdupq_n_u8(0xd8)); + auto block_is_low = vceqq_u8(block_masked, vdupq_n_u8(0xdc)); - const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT); - const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT); - const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo); - const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi); - const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo); - const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi); + // illseq will mark every low surrogate in the offset block. + // that is not preceded by a high surrogate + // + // It will also mark every high surrogate in the main block + // that is not followed by a low surrogate + // + // This means that it will miss undetectable errors, like a high surrogate + // at the last index of the main block. And similarly a low surrogate + // at the index prior to the main block that was not preceded by a high + // surrogate. + // + // The interpretation of the values is that they start with the end value + // of the prior block, and end just before the end of the main block + // (minus one). + auto illseq = veorq_u8(lb_is_high, block_is_low); + number_of_unpaired_surrogates += vaddlvq_u8(vandq_u8(illseq, one)); + } + auto c0 = + vminq_u8(vorrq_u8(vandq_u8(base_input.val[idx_lsb], vdupq_n_u8(0x80)), + base_input.val[idx]), + one); + auto c1 = vminq_u8(vandq_u8(base_input.val[idx], vdupq_n_u8(0xf8)), one); + + auto v_count = vaddq_u8(c1, c0); + v_count = vaddq_u8(v_count, is_surrogate); + count += vaddlvq_u8(v_count); // sum the counts in the vector + ///////// + // The vaddlvq_u8 instruction could be slow on some hardware. We could + // consider various alternatives if that is an issue such as accumulating + // into a vector of uint16_t or uint8_t and summing only at the end or + // periodically. However, on fast chipsets, like Apple Silicon, it is + // likely fast enough, or even faster than alternatives. + ///////// + } + + //!!!!!!!!!!!!!!! + // Here, we have processed up to pos - 1 (inclusive) code units. Except for + // the case where the value at pos is a low surrogate not preceded by a high + // surrogate. 
In this special case, we have already added one to the count for + // the unpaired low surrogate. + //!!!!!!!!!!!!!!! + if (scalar::utf16::is_low_surrogate(in[pos])) { + any_surrogates = true; + if (!scalar::utf16::is_high_surrogate(in[pos - 1])) { + number_of_unpaired_surrogates -= 1; + count += 2; + pos += 1; + } + } + count += pos; + count += number_of_unpaired_surrogates; + // If we end with a high surrogate, it might be unpaired or not, we + // don't know. It counts as a pair suggarate for now. + if (scalar::utf16::is_high_surrogate(in[pos - 1])) { + any_surrogates = true; + if (pos == size) { + count += 2; + } else if (scalar::utf16::is_low_surrogate(in[pos])) { + pos += 1; + count += 2; + } + } + result scalar_result = + scalar::utf16::utf8_length_from_utf16_with_replacement( + in + pos, size - pos); + return {any_surrogates ? SURROGATE : scalar_result.error, + count + scalar_result.count}; +} +/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8 - uint64_t advlo = _mm_popcnt_u64(wantlo_uint64); - uint64_t advhi = _mm_popcnt_u64(wanthi_uint64); +#if SIMDUTF_FEATURE_BASE64 +/* begin file src/arm64/arm_base64.cpp */ +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ - _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo); - _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi); - outbuf += advlo + advhi; +/** + * Insert a line feed character in the 16-byte input at index K in [0,16). 
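+ *
+ * (Editorial note) The shuffle mask for K places the out-of-range index
+ * 0x80 at position K: vqtbl1q_u8 zeroes that lane and shifts input bytes
+ * K..14 one position to the right, after which vbslq_u8 writes '\n' into
+ * the zeroed lane. Input byte 15 is dropped here; the caller re-emits it
+ * separately.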
+ */ +inline uint8x16_t insert_line_feed16(uint8x16_t input, size_t K) { + static const uint8_t shuffle_masks[16][16] = { + {0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80}}; + // Prepare a vector with '\n' (0x0A) + uint8x16_t line_feed_vector = vdupq_n_u8('\n'); + + // Load the precomputed shuffle mask for K + uint8x16_t mask = vld1q_u8(shuffle_masks[K]); + + // Create a mask where 0x80 indicates the line feed position + uint8x16_t lf_pos = vceqq_u8(mask, vdupq_n_u8(0x80)); + + uint8x16_t result = vqtbl1q_u8(input, mask); + + // Use vbsl to select '\n' where lf_pos is true, else keep input bytes + return vbslq_u8(lf_pos, line_feed_vector, result); +} + +// offset is the number of characters in the current line. +// It can range from 0 to line_length (inclusive). +// If offset == line_length, we need to insert a line feed before writing +// anything. +size_t write_output_with_line_feeds(uint8_t *dst, uint8x16_t src, + size_t line_length, size_t &offset) { + // Fast path: no need to insert line feeds + // If we are at offset, we would write from [offset, offset + 16). + // We need that line_length >= offset + 16. + if (offset + 16 <= line_length) { + // No need to insert line feeds + vst1q_u8(dst, src); + offset += 16; // offset could be line_length here. + return 16; } - outbuf -= adjust; -tail: - if (inlen != 0) { - // We must have inlen < 31. - inmask = _cvtu32_mask32((1 << inlen) - 1); - in = _mm512_maskz_loadu_epi16(inmask, inbuf); - if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } - adjust = inlen - 31; - inlen = 0; - goto lastiteration; - } - *outlen = (outbuf - outbuf_orig) + adjust; - return ((inbuf - inbuf_orig) + adjust); -} -/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ -// file included directly - -/* - Returns a pair: the first unprocessed byte from buf and utf32_output - A scalar routing should carry on the conversion of the tail. 
-*/ -template -std::tuple convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) { - const char16_t* end = buf + len; - const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00); - const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); - const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00); - __mmask32 carry{0}; - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - while (std::distance(buf,end) >= 32) { - // Always safe because buf + 32 <= end so that end - buf >= 32 bytes: - __m512i in = _mm512_loadu_si512((__m512i*)buf); - if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } - - // H - bitmask for high surrogates - const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800); - // H - bitmask for low surrogates - const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00); - - if ((H|L)) { - // surrogate pair(s) in a register - const __mmask32 V = (L ^ (carry | (H << 1))); // A high surrogate must be followed by low one and a low one must be preceded by a high one. - // If valid, V should be equal to 0 - - if(V == 0) { - // valid case - /* - Input surrogate pair: - |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb| - low surrogate high surrogate - */ - /* 1. Expand all code units to 32-bit code units - in |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb| - */ - const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)); - const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1)); - - /* 2. Shift by one 16-bit word to align low surrogates with high surrogates - in |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb| - shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa| - */ - const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1); - const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1); - - /* 3. Align all high surrogates in first and second by shifting to the left by 10 bits - |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000| - */ - const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10); - const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H>>16), second, 10); - - /* 4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant - in |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000| - shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa| - constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000| - */ - const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400); - const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first); - const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant); - - const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H>>16), aligned_second, shifted_second); - const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H>>16), added_second, constant); - - // 5. 
Store all valid UTF-32 code units (low surrogate positions and 32nd word are invalid) - const __mmask32 valid = ~L & 0x7fffffff; - // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32 - // to ease performance portability to Zen 4. - const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first); - const size_t howmany1 = count_ones((uint16_t)(valid)); - _mm512_storeu_si512((__m512i *) utf32_output, compressed_first); - utf32_output += howmany1; - const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second); - const size_t howmany2 = count_ones((uint16_t)(valid >> 16)); - // The following could be unsafe in some cases? - //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second); - _mm512_mask_storeu_epi32((__m512i *) utf32_output, __mmask16((1<> 30) & 0x1; - } else { - // invalid case - return std::make_tuple(buf+carry, utf32_output, false); + // We have that offset + 16 >= line_length + // the common case is that line_length is greater than 16 + if (simdutf_likely(line_length >= 16)) { + // offset <= line_length. + // offset + 16 > line_length + // So line_length - offset < 16 + // and line_length - offset >= 0 + uint8x16_t chunk = insert_line_feed16(src, line_length - offset); + vst1q_u8(dst, chunk); + // Not ideal to pull the last element and write it separately but + // it simplifies the code. + *(dst + 16) = vgetq_lane_u8(src, 15); + offset += 16 - line_length; + return 16 + 1; // we wrote 16 bytes plus one line feed + } + // Uncommon case where line_length < 16 + // This is going to be SLOW. + else { + uint8_t buffer[16]; + vst1q_u8(buffer, src); + size_t out_pos = 0; + size_t local_offset = offset; + for (size_t i = 0; i < 16;) { + if (local_offset == line_length) { + dst[out_pos++] = '\n'; + local_offset = 0; } - } else { - // no surrogates - // extend all thirty-two 16-bit code units to thirty-two 32-bit code units - _mm512_storeu_si512((__m512i *)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in))); - _mm512_storeu_si512((__m512i *)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1))); - utf32_output += 32; - buf += 32; - carry = 0; - } - } // while - return std::make_tuple(buf+carry, utf32_output, true); -} -/* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */ -// file included directly -size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len, - char *latin1_output) { - const char32_t *end = buf + len; - __m512i v_0xFF = _mm512_set1_epi32(0xff); - __m512i shufmask = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, - 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0); - while (buf + 16 <= end) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { - return 0; - } - _mm_storeu_si128((__m128i *)latin1_output, - _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); - latin1_output += 16; - buf += 16; - } - if (buf < end) { - uint16_t mask = uint16_t((1 << (end - buf)) - 1); - __m512i in = _mm512_maskz_loadu_epi32(mask, buf); - if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { - return 0; + dst[out_pos++] = buffer[i++]; + local_offset++; } - _mm_mask_storeu_epi8( - latin1_output, mask, - _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + offset = local_offset; + 
return out_pos; } - return len; } -std::pair -icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, - char *latin1_output) { - const char32_t *end = buf + len; - const char32_t *start = buf; - __m512i v_0xFF = _mm512_set1_epi32(0xff); - __m512i shufmask = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, - 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0); - while (buf + 16 <= end) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { - while (uint32_t(*buf) <= 0xff) { - *latin1_output++ = uint8_t(*buf++); +template +size_t encode_base64_impl(char *dst, const char *src, size_t srclen, + base64_options options, + size_t line_length = simdutf::default_line_length) { + size_t offset = 0; + if (line_length < 4) { + line_length = 4; // We do not support line_length less than 4 + } + // credit: Wojciech Muła + uint8_t *out = (uint8_t *)dst; + constexpr static uint8_t source_table[64] = { + 'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D', + 'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W', + 'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p', + '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8', + 'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/', + }; + constexpr static uint8_t source_table_url[64] = { + 'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D', + 'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W', + 'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p', + '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8', + 'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_', + }; + const uint8x16_t v3f = vdupq_n_u8(0x3f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + // When trying to load a uint8_t array, Visual Studio might + // error with: error C2664: '__n128x4 neon_ld4m_q8(const char *)': + // cannot convert argument 1 from 'const uint8_t [64]' to 'const char * + const uint8x16x4_t table = vld4q_u8( + (reinterpret_cast(options & base64_url) ? source_table_url + : source_table)); +#else + const uint8x16x4_t table = + vld4q_u8((options & base64_url) ? 
source_table_url : source_table); +#endif + size_t i = 0; + for (; i + 16 * 3 <= srclen; i += 16 * 3) { + const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i); + uint8x16x4_t result; + result.val[0] = vshrq_n_u8(in.val[0], 2); + result.val[1] = + vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), v3f); + result.val[2] = + vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), v3f); + result.val[3] = vandq_u8(in.val[2], v3f); + result.val[0] = vqtbl4q_u8(table, result.val[0]); + result.val[1] = vqtbl4q_u8(table, result.val[1]); + result.val[2] = vqtbl4q_u8(table, result.val[2]); + result.val[3] = vqtbl4q_u8(table, result.val[3]); + if (insert_line_feeds) { + if (line_length >= 64) { // fast path + vst4q_u8(out, result); + if (offset + 64 > line_length) { + size_t location_end = line_length - offset; + size_t to_move = 64 - location_end; + std::memmove(out + location_end + 1, out + location_end, to_move); + out[location_end] = '\n'; + offset = to_move; + out += 64 + 1; + } else { + offset += 64; + out += 64; + } + } else { // slow path + uint8x16x2_t Z0 = vzipq_u8(result.val[0], result.val[1]); + uint8x16x2_t Z1 = vzipq_u8(result.val[2], result.val[3]); + uint16x8x2_t Z2 = vzipq_u16(vreinterpretq_u16_u8(Z0.val[0]), + vreinterpretq_u16_u8(Z1.val[0])); + uint16x8x2_t Z3 = vzipq_u16(vreinterpretq_u16_u8(Z0.val[1]), + vreinterpretq_u16_u8(Z1.val[1])); + uint8x16_t T0 = vreinterpretq_u8_u16(Z2.val[0]); + uint8x16_t T1 = vreinterpretq_u8_u16(Z2.val[1]); + uint8x16_t T2 = vreinterpretq_u8_u16(Z3.val[0]); + uint8x16_t T3 = vreinterpretq_u8_u16(Z3.val[1]); + out += write_output_with_line_feeds(out, T0, line_length, offset); + out += write_output_with_line_feeds(out, T1, line_length, offset); + out += write_output_with_line_feeds(out, T2, line_length, offset); + out += write_output_with_line_feeds(out, T3, line_length, offset); } - return std::make_pair(result(error_code::TOO_LARGE, buf - start), - latin1_output); + } else { + vst4q_u8(out, result); + out += 64; } - _mm_storeu_si128((__m128i *)latin1_output, - _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); - latin1_output += 16; - buf += 16; } - if (buf < end) { - uint16_t mask = uint16_t((1 << (end - buf)) - 1); - __m512i in = _mm512_maskz_loadu_epi32(mask, buf); - if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { - while (uint32_t(*buf) <= 0xff) { - *latin1_output++ = uint8_t(*buf++); + + if (i + 24 <= srclen) { + const uint8x8_t v3f_d = vdup_n_u8(0x3f); + const uint8x8x3_t in = vld3_u8((const uint8_t *)src + i); + uint8x8x4_t result; + result.val[0] = vshr_n_u8(in.val[0], 2); + result.val[1] = + vand_u8(vsli_n_u8(vshr_n_u8(in.val[1], 4), in.val[0], 4), v3f_d); + result.val[2] = + vand_u8(vsli_n_u8(vshr_n_u8(in.val[2], 6), in.val[1], 2), v3f_d); + result.val[3] = vand_u8(in.val[2], v3f_d); + result.val[0] = vqtbl4_u8(table, result.val[0]); + result.val[1] = vqtbl4_u8(table, result.val[1]); + result.val[2] = vqtbl4_u8(table, result.val[2]); + result.val[3] = vqtbl4_u8(table, result.val[3]); + if (insert_line_feeds) { + if (line_length >= 32) { // fast path + vst4_u8(out, result); + if (offset + 32 > line_length) { + size_t location_end = line_length - offset; + size_t to_move = 32 - location_end; + std::memmove(out + location_end + 1, out + location_end, to_move); + out[location_end] = '\n'; + offset = to_move; + out += 32 + 1; + } else { + offset += 32; + out += 32; + } + } else { // slow path + uint8x8x2_t Z0 = vzip_u8(result.val[0], result.val[1]); + uint8x8x2_t Z1 = vzip_u8(result.val[2], result.val[3]); + 
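Two details of the NEON encoder above are worth spelling out. First, source_table is stored pre-interleaved so that vld4q_u8, which de-interleaves with stride 4, reconstructs the base64 alphabet in order (table.val[0] ends up holding 'A'..'P', table.val[1] 'Q'..'f', and so on), letting vqtbl4q_u8 act as a single 64-entry lookup. Second, the vshrq/vsliq/vandq sequence is the standard three-bytes-to-four-sextets split; a scalar sketch (illustration only):

#include <cstdint>

// Three input bytes yield four 6-bit alphabet indices.
static void split_3_to_4(const uint8_t in[3], uint8_t idx[4]) {
  idx[0] = in[0] >> 2;                                    // in0[7:2]
  idx[1] = uint8_t(((in[0] & 0x03) << 4) | (in[1] >> 4)); // in0[1:0].in1[7:4]
  idx[2] = uint8_t(((in[1] & 0x0f) << 2) | (in[2] >> 6)); // in1[3:0].in2[7:6]
  idx[3] = in[2] & 0x3f;                                  // in2[5:0]
}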
uint16x4x2_t Z2 = vzip_u16(vreinterpret_u16_u8(Z0.val[0]), + vreinterpret_u16_u8(Z1.val[0])); + uint16x4x2_t Z3 = vzip_u16(vreinterpret_u16_u8(Z0.val[1]), + vreinterpret_u16_u8(Z1.val[1])); + uint8x8_t T0 = vreinterpret_u8_u16(Z2.val[0]); + uint8x8_t T1 = vreinterpret_u8_u16(Z2.val[1]); + uint8x8_t T2 = vreinterpret_u8_u16(Z3.val[0]); + uint8x8_t T3 = vreinterpret_u8_u16(Z3.val[1]); + uint8x16_t TT0 = vcombine_u8(T0, T1); + uint8x16_t TT1 = vcombine_u8(T2, T3); + out += write_output_with_line_feeds(out, TT0, line_length, offset); + out += write_output_with_line_feeds(out, TT1, line_length, offset); } - return std::make_pair(result(error_code::TOO_LARGE, buf - start), - latin1_output); + } else { + vst4_u8(out, result); + out += 32; } - _mm_mask_storeu_epi8( - latin1_output, mask, - _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + i += 24; } - return std::make_pair(result(error_code::SUCCESS, len), latin1_output); + out += scalar::base64::tail_encode_base64_impl( + (char *)out, src + i, srclen - i, options, line_length, offset); + return size_t((char *)out - dst); } -/* end file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ -// file included directly -// Todo: currently, this is just the haswell code, optimize for icelake kernel. -std::pair avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) { - const char32_t* end = buf + len; - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); - const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); - const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); - const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); - __m256i running_max = _mm256_setzero_si256(); - __m256i forbidden_bytemask = _mm256_setzero_si256(); +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + return encode_base64_impl(dst, src, srclen, options); +} - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 +static inline void compress(uint8x16_t data, uint16_t mask, char *output) { + if (mask == 0) { + vst1q_u8((uint8_t *)output, data); + return; + } + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + uint64x2_t compactmasku64 = {tables::base64::thintable_epi8[mask1], + tables::base64::thintable_epi8[mask2]}; + uint8x16_t compactmask = vreinterpretq_u8_u64(compactmasku64); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t off = + simdutf_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8); +#else + const uint8x16_t off = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; +#endif - while (buf + 16 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); - running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); + compactmask = vaddq_u8(compactmask, off); + uint8x16_t pruned = vqtbl1q_u8(data, compactmask); - // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation - __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); - in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + int pop1 = tables::base64::BitsSetTable256mul2[mask1]; + // then load the 
corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + compactmask = vld1q_u8(tables::base64::pshufb_combine_table + pop1 * 8); + uint8x16_t answer = vqtbl1q_u8(pruned, compactmask); + vst1q_u8((uint8_t *)output, answer); +} - // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp) +struct block64 { + uint8x16_t chunks[4]; +}; - if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } - // no bits set above 7th bit - const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); - const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); +static_assert(sizeof(block64) == 64, "block64 is not 64 bytes"); +template +uint64_t to_base64_mask(block64 *b, bool *error) { + uint8x16_t v0f = vdupq_n_u8(0xf); + uint8x16_t v01 = vdupq_n_u8(0x1); - // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f); + uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f); + uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f); + uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f); - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in_16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in_16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); - - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes - - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; - - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); - - const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); - // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); - utf8_output += row_2[0]; - - // 6. 
adjust pointers - buf += 16; - continue; - } - // Must check for overflow in packing - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); - forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); + // Needed by the decoding step. + uint8x16_t hi_bits0 = vshrq_n_u8(b->chunks[0], 3); + uint8x16_t hi_bits1 = vshrq_n_u8(b->chunks[1], 3); + uint8x16_t hi_bits2 = vshrq_n_u8(b->chunks[2], 3); + uint8x16_t hi_bits3 = vshrq_n_u8(b->chunks[3], 3); + uint8x16_t lut_lo; +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + if (default_or_url) { + lut_lo = + simdutf_make_uint8x16_t(0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, + 0xf8, 0xf9, 0xf1, 0xa2, 0xa1, 0xa5, 0xa0, 0xa6); + } else if (base64_url) { + lut_lo = + simdutf_make_uint8x16_t(0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, + 0xf8, 0xf9, 0xf1, 0xa0, 0xa1, 0xa5, 0xa0, 0xa2); + } else { + lut_lo = + simdutf_make_uint8x16_t(0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, + 0xf8, 0xf9, 0xf1, 0xa2, 0xa1, 0xa1, 0xa0, 0xa4); + } +#else + if (default_or_url) { + lut_lo = uint8x16_t{0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, + 0xf8, 0xf9, 0xf1, 0xa2, 0xa1, 0xa5, 0xa0, 0xa6}; + } else if (base64_url) { + lut_lo = uint8x16_t{0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, + 0xf8, 0xf9, 0xf1, 0xa0, 0xa1, 0xa5, 0xa0, 0xa2}; + } else { + lut_lo = uint8x16_t{0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, + 0xf8, 0xf9, 0xf1, 0xa2, 0xa1, 0xa1, 0xa0, 0xa4}; + } +#endif + uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0); + uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1); + uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2); + uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3); + uint8x16_t lut_hi; +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + if (default_or_url) { + lut_hi = + simdutf_make_uint8x16_t(0x0, 0x1, 0x0, 0x0, 0x1, 0x6, 0x8, 0x8, 0x10, + 0x20, 0x20, 0x12, 0x40, 0x80, 0x80, 0x40); + } else if (base64_url) { + lut_hi = + simdutf_make_uint8x16_t(0x0, 0x1, 0x0, 0x0, 0x1, 0x6, 0x8, 0x8, 0x10, + 0x20, 0x20, 0x12, 0x40, 0x80, 0x80, 0x40); + } else { + lut_hi = + simdutf_make_uint8x16_t(0x0, 0x1, 0x0, 0x0, 0x1, 0x6, 0x8, 0x8, 0x10, + 0x20, 0x20, 0x10, 0x40, 0x80, 0x80, 0x40); + } +#else + if (default_or_url) { + lut_hi = uint8x16_t{0x0, 0x1, 0x0, 0x0, 0x1, 0x6, 0x8, 0x8, + 0x10, 0x20, 0x20, 0x12, 0x40, 0x80, 0x80, 0x40}; + } else if (base64_url) { + lut_hi = uint8x16_t{0x0, 0x1, 0x0, 0x0, 0x1, 0x4, 0x8, 0x8, + 0x10, 0x20, 0x20, 0x12, 0x40, 0x80, 0x80, 0x40}; + } else { + lut_hi = uint8x16_t{0x0, 0x1, 0x0, 0x0, 0x1, 0x6, 0x8, 0x8, + 0x10, 0x20, 0x20, 0x10, 0x40, 0x80, 0x80, 0x40}; + } +#endif + uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_bits0); + uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_bits1); + uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_bits2); + uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_bits3); - const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + // maps error byte to 0 and space byte to 1, valid bytes are >1 + uint8x16_t res0 = vandq_u8(lo0, hi0); + uint8x16_t res1 = vandq_u8(lo1, hi1); + uint8x16_t res2 = vandq_u8(lo2, hi2); + uint8x16_t res3 = 
vandq_u8(lo3, hi3); - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + uint8_t checks = + vminvq_u8(vminq_u8(vminq_u8(res0, res1), vminq_u8(res2, res3))); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = + simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); +#else + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + uint64_t badcharmask = 0; + *error = checks == 0; + if (checks <= 1) { + // Add each of the elements next to each other, successively, to stuff each + // 8 byte mask into one. + uint8x16_t test0 = vcleq_u8(res0, v01); + uint8x16_t test1 = vcleq_u8(res1, v01); + uint8x16_t test2 = vcleq_u8(res2, v01); + uint8x16_t test3 = vcleq_u8(res3, v01); + uint8x16_t sum0 = + vpaddq_u8(vandq_u8(test0, bit_mask), vandq_u8(test1, bit_mask)); + uint8x16_t sum1 = + vpaddq_u8(vandq_u8(test2, bit_mask), vandq_u8(test3, bit_mask)); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + badcharmask = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + // This is the transformation step that can be done while we are waiting for + // sum0 + uint8x16_t roll_lut; +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + if (default_or_url) { + roll_lut = + simdutf_make_uint8x16_t(0xBF, 0xE0, 0xB9, 0x13, 0x04, 0xBF, 0xBF, 0xB9, + 0xB9, 0x00, 0xFF, 0x11, 0xFF, 0xBF, 0x10, 0xB9); + } else if (base64_url) { + roll_lut = + simdutf_make_uint8x16_t(0xB9, 0xB9, 0xBF, 0xBF, 0x04, 0x11, 0xE0, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + } else { + roll_lut = + simdutf_make_uint8x16_t(0xB9, 0xB9, 0xBF, 0xBF, 0x04, 0x10, 0x13, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + } +#else + if (default_or_url) { + roll_lut = uint8x16_t{0xBF, 0xE0, 0xB9, 0x13, 0x04, 0xBF, 0xBF, 0xB9, + 0xB9, 0x00, 0xFF, 0x11, 0xFF, 0xBF, 0x10, 0xB9}; + } else if (base64_url) { + roll_lut = uint8x16_t{0xB9, 0xB9, 0xBF, 0xBF, 0x04, 0x11, 0xE0, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + } else { + roll_lut = uint8x16_t{0xB9, 0xB9, 0xBF, 0xBF, 0x04, 0x10, 0x13, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + } +#endif + uint8x16_t roll0, roll1, roll2, roll3; + if (default_or_url) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t delta_asso = + simdutf_make_uint8x16_t(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16); +#else + const uint8x16_t delta_asso = + uint8x16_t{0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16}; +#endif + // the logic of translating is based on westmere + uint8x16_t delta_hash0 = + vrhaddq_u8(vqtbl1q_u8(delta_asso, lo_nibbles0), hi_bits0); + uint8x16_t delta_hash1 = + vrhaddq_u8(vqtbl1q_u8(delta_asso, lo_nibbles1), hi_bits1); + uint8x16_t delta_hash2 = + vrhaddq_u8(vqtbl1q_u8(delta_asso, lo_nibbles2), hi_bits2); + uint8x16_t delta_hash3 = + vrhaddq_u8(vqtbl1q_u8(delta_asso, lo_nibbles3), hi_bits3); + const uint8x16x2_t roll_lut_2 = {roll_lut, roll_lut}; + roll0 = vqtbl2q_u8(roll_lut_2, delta_hash0); + roll1 = vqtbl2q_u8(roll_lut_2, delta_hash1); + roll2 = vqtbl2q_u8(roll_lut_2, delta_hash2); + roll3 = vqtbl2q_u8(roll_lut_2, delta_hash3); + } else { + uint8x16_t delta_hash0 = vclzq_u8(res0); + 
uint8x16_t delta_hash1 = vclzq_u8(res1); + uint8x16_t delta_hash2 = vclzq_u8(res2); + uint8x16_t delta_hash3 = vclzq_u8(res3); + roll0 = vqtbl1q_u8(roll_lut, delta_hash0); + roll1 = vqtbl1q_u8(roll_lut, delta_hash1); + roll2 = vqtbl1q_u8(roll_lut, delta_hash2); + roll3 = vqtbl1q_u8(roll_lut, delta_hash3); + } - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + b->chunks[0] = vaddq_u8(b->chunks[0], roll0); + b->chunks[1] = vaddq_u8(b->chunks[1], roll1); + b->chunks[2] = vaddq_u8(b->chunks[2], roll2); + b->chunks[3] = vaddq_u8(b->chunks[3], roll3); + return badcharmask; +} - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. +void copy_block(block64 *b, char *output) { + vst1q_u8((uint8_t *)output, b->chunks[0]); + vst1q_u8((uint8_t *)output + 16, b->chunks[1]); + vst1q_u8((uint8_t *)output + 32, b->chunks[2]); + vst1q_u8((uint8_t *)output + 48, b->chunks[3]); +} - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. +uint64_t compress_block(block64 *b, uint64_t mask, char *output) { + uint64_t popcounts = + vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0); + uint64_t offsets = popcounts * 0x0101010101010101; + compress(b->chunks[0], uint16_t(mask), output); + compress(b->chunks[1], uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF]); + compress(b->chunks[2], uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF]); + compress(b->chunks[3], uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF]); + return offsets >> 56; +} - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000)); +// The caller of this function is responsible to ensure that there are 64 bytes +// available from reading at src. The data is read into a block64 structure. 
+void load_block(block64 *b, const char *src) { + b->chunks[0] = vld1q_u8(reinterpret_cast(src)); + b->chunks[1] = vld1q_u8(reinterpret_cast(src) + 16); + b->chunks[2] = vld1q_u8(reinterpret_cast(src) + 32); + b->chunks[3] = vld1q_u8(reinterpret_cast(src) + 48); +} - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); -#undef simdutf_vec +// The caller of this function is responsible to ensure that there are 32 bytes +// available from reading at data. It returns a 16-byte value, narrowing with +// saturation the 16-bit words. +inline uint8x16_t load_satured(const uint16_t *data) { + uint16x8_t in1 = vld1q_u16(data); + uint16x8_t in2 = vld1q_u16(data + 8); + return vqmovn_high_u16(vqmovn_u16(in1), in2); +} - // 4. expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); +// The caller of this function is responsible to ensure that there are 128 bytes +// available from reading at src. The data is read into a block64 structure. +void load_block(block64 *b, const char16_t *src) { + b->chunks[0] = load_satured(reinterpret_cast(src)); + b->chunks[1] = load_satured(reinterpret_cast(src) + 16); + b->chunks[2] = load_satured(reinterpret_cast(src) + 32); + b->chunks[3] = load_satured(reinterpret_cast(src) + 48); +} - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be useful. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. 
- const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); - const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); - utf8_output += 12; - buf += 16; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); +// decode 64 bytes and output 48 bytes +void base64_decode_block(char *out, const char *src) { + uint8x16x4_t str = vld4q_u8((uint8_t *)src); + uint8x16x3_t outvec; + outvec.val[0] = vsliq_n_u8(vshrq_n_u8(str.val[1], 4), str.val[0], 2); + outvec.val[1] = vsliq_n_u8(vshrq_n_u8(str.val[2], 2), str.val[1], 4); + outvec.val[2] = vsliq_n_u8(str.val[3], str.val[2], 6); + vst3q_u8((uint8_t *)out, outvec); +} - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); +static size_t compress_block_single(block64 *b, uint64_t mask, char *output) { + const size_t pos64 = trailing_zeroes(mask); + const int8_t pos = pos64 & 0xf; - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); - const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); + // Predefine the index vector +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t v1 = simdutf_make_uint8x16_t(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15); +#else // SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t v1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO + switch (pos64 >> 4) { + case 0b00: { + const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1)); + const uint8x16_t v2 = + vcgtq_s8(vreinterpretq_s8_u8(v1), + vreinterpretq_s8_u8(v0)); // Compare greater than + const uint8x16_t sh = vsubq_u8(v1, v2); // Subtract + const uint8x16_t compressed = + vqtbl1q_u8(b->chunks[0], sh); // Table lookup (shuffle) + + vst1q_u8((uint8_t *)(output + 0 * 16), compressed); + vst1q_u8((uint8_t *)(output + 1 * 16 - 1), b->chunks[1]); + vst1q_u8((uint8_t *)(output + 2 * 16 - 1), b->chunks[2]); + vst1q_u8((uint8_t *)(output + 3 * 16 - 1), b->chunks[3]); + } break; + + case 0b01: { + vst1q_u8((uint8_t *)(output + 0 * 16), b->chunks[0]); + + const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1)); + const uint8x16_t v2 = + vcgtq_s8(vreinterpretq_s8_u8(v1), vreinterpretq_s8_u8(v0)); + const uint8x16_t sh = vsubq_u8(v1, v2); + const uint8x16_t compressed = vqtbl1q_u8(b->chunks[1], sh); + + vst1q_u8((uint8_t *)(output + 1 * 16), compressed); + vst1q_u8((uint8_t *)(output + 2 * 16 - 1), b->chunks[2]); + vst1q_u8((uint8_t *)(output + 3 * 16 - 1), b->chunks[3]); + } break; + + case 0b10: { 
+ vst1q_u8((uint8_t *)(output + 0 * 16), b->chunks[0]); + vst1q_u8((uint8_t *)(output + 1 * 16), b->chunks[1]); + + const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1)); + const uint8x16_t v2 = + vcgtq_s8(vreinterpretq_s8_u8(v1), vreinterpretq_s8_u8(v0)); + const uint8x16_t sh = vsubq_u8(v1, v2); + const uint8x16_t compressed = vqtbl1q_u8(b->chunks[2], sh); + + vst1q_u8((uint8_t *)(output + 2 * 16), compressed); + vst1q_u8((uint8_t *)(output + 3 * 16 - 1), b->chunks[3]); + } break; + + case 0b11: { + vst1q_u8((uint8_t *)(output + 0 * 16), b->chunks[0]); + vst1q_u8((uint8_t *)(output + 1 * 16), b->chunks[1]); + vst1q_u8((uint8_t *)(output + 2 * 16), b->chunks[2]); + + const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1)); + const uint8x16_t v2 = + vcgtq_s8(vreinterpretq_s8_u8(v1), vreinterpretq_s8_u8(v0)); + const uint8x16_t sh = vsubq_u8(v1, v2); + const uint8x16_t compressed = vqtbl1q_u8(b->chunks[3], sh); + + vst1q_u8((uint8_t *)(output + 3 * 16), compressed); + } break; + } + return 63; +} + +template bool is_power_of_two(T x) { return (x & (x - 1)) == 0; } + +template +full_result +compress_decode_base64(char *dst, const char_type *src, size_t srclen, + base64_options options, + last_chunk_handling_options last_chunk_options) { + const uint8_t *to_base64 = + default_or_url ? tables::base64::to_base64_default_or_url_value + : (base64_url ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value); + auto ri = simdutf::scalar::base64::find_end(src, srclen, options); + size_t equallocation = ri.equallocation; + size_t equalsigns = ri.equalsigns; + srclen = ri.srclen; + size_t full_input_length = ri.full_input_length; + if (srclen == 0) { + if (!ignore_garbage && equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, full_input_length, 0}; + } + const char_type *const srcinit = src; + const char *const dstinit = dst; + const char_type *const srcend = src + srclen; - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); - const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + constexpr size_t block_size = 10; + char buffer[block_size * 64]; + char *bufferptr = buffer; + if (srclen >= 64) { + const char_type *const srcend64 = src + srclen - 64; + while (src <= srcend64) { + block64 b; + load_block(&b, src); + src += 64; + bool error = false; + uint64_t badcharmask = + to_base64_mask(&b, &error); + if (badcharmask) { + if (error && !ignore_garbage) { + src -= 64; + while (src < srcend && scalar::base64::is_eight_byte(*src) && + to_base64[uint8_t(*src)] <= 64) { + src++; + } + if (src < srcend) { + // should never happen + } + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + } - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; - } else { - // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // may require large, non-trivial tables? 
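compress_block_single above removes exactly one byte from a 16-byte lane without a lookup table: vcgtq_s8 against pos - 1 yields 0xff for every index at or past the gap, and subtracting that mask bumps those indices by one, so the vqtbl1q_u8 gather skips the unwanted byte. A scalar model of the index math (names are mine):

#include <cstdint>

// sh[i] = i + (i >= pos), mirroring v1 - vcgtq_s8(v1, pos - 1) above.
// Lane 15 then indexes past the source; vqtbl1q_u8 returns 0 for it, and
// the caller overlaps the following store by one byte so that lane never
// survives in the output.
static void skip_one_byte(const uint8_t in[16], unsigned pos,
                          uint8_t out[16]) {
  for (unsigned i = 0; i < 16; ++i) {
    unsigned sh = i + (i >= pos ? 1u : 0u);
    out[i] = (sh < 16) ? in[sh] : 0;
  }
}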
- size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { // 2-byte - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word & 0xFFFF0000 )==0) { // 3-byte - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { // 4-byte - if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + if (badcharmask != 0) { + // optimization opportunity: check for simple masks like those made of + // continuous 1s followed by continuous 0s. And masks containing a + // single bad character. + if (is_power_of_two(badcharmask)) { + bufferptr += compress_block_single(&b, badcharmask, bufferptr); + } else { + bufferptr += compress_block(&b, badcharmask, bufferptr); + } + } else { + // optimization opportunity: if bufferptr == buffer and mask == 0, we + // can avoid the call to compress_block and decode directly. + copy_block(&b, bufferptr); + bufferptr += 64; + } + if (bufferptr >= (block_size - 1) * 64 + buffer) { + for (size_t i = 0; i < (block_size - 1); i++) { + base64_decode_block(dst, buffer + i * 64); + dst += 48; } + std::memcpy(buffer, buffer + (block_size - 1) * 64, + 64); // 64 might be too much + bufferptr -= (block_size - 1) * 64; } - buf += k; } - } // while - - // check for invalid input - const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); - if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { - return std::make_pair(nullptr, utf8_output); + } + char *buffer_start = buffer; + // Optimization note: if this is almost full, then it is worth our + // time, otherwise, we should just decode directly. 
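In the hot loop above, which flushes 48 decoded bytes per 64 base64 characters, compress_block positions its four 16-byte stores with a classic SWAR trick: the per-8-byte popcounts of kept bytes are multiplied by 0x0101010101010101, which turns byte k of the product into the running sum of bytes 0..k (no carries are possible, since each count is at most 8). A minimal sketch:

#include <cstdint>

// Byte k of x * 0x0101...01 is the prefix sum of bytes 0..k of x.
static uint64_t prefix_sums(uint64_t per_byte_counts) {
  return per_byte_counts * 0x0101010101010101ull;
}
// Example: counts {1,2,3,4,5,6,7,8} (least-significant byte first)
// become {1,3,6,10,15,21,28,36}; (result >> 56) is the grand total,
// which is what compress_block returns above.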
+ int last_block = (int)((bufferptr - buffer_start) % 64); + if (last_block != 0 && srcend - src + last_block >= 64) { + while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { + uint8_t val = to_base64[uint8_t(*src)]; + *bufferptr = char(val); + if ((!scalar::base64::is_eight_byte(*src) || val > 64) && + !ignore_garbage) { + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + bufferptr += (val <= 63); + src++; + } } - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); } + for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { + base64_decode_block(dst, buffer_start); + dst += 48; + } + if ((bufferptr - buffer_start) % 64 != 0) { + while (buffer_start + 4 < bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; +#if !SIMDUTF_IS_BIG_ENDIAN + triple = scalar::u32_swap_bytes(triple); +#endif + std::memcpy(dst, &triple, 4); - return std::make_pair(buf, utf8_output); -} + dst += 3; + buffer_start += 4; + } + if (buffer_start + 4 <= bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; +#if !SIMDUTF_IS_BIG_ENDIAN + triple = scalar::u32_swap_bytes(triple); +#endif + std::memcpy(dst, &triple, 3); -// Todo: currently, this is just the haswell code, optimize for icelake kernel. -std::pair avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) { - const char32_t* end = buf + len; - const char32_t* start = buf; + dst += 3; + buffer_start += 4; + } + // we may have 1, 2 or 3 bytes left and we need to decode them so let us + // backtrack + int leftover = int(bufferptr - buffer_start); + while (leftover > 0) { + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + } else { + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } + } + src--; + leftover--; + } + } + if (src < srcend + equalsigns) { + full_result r = scalar::base64::base64_tail_decode( + dst, src, srcend - src, equalsigns, options, last_chunk_options); + r = scalar::base64::patch_tail_result( + r, size_t(src - srcinit), size_t(dst - dstinit), equallocation, + full_input_length, last_chunk_options); + // When is_partial(last_chunk_options) is true, we must either end with + // the end of the stream (beyond whitespace) or right after a non-ignorable + // character or at the very beginning of the stream. + // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS && + r.input_count < full_input_length) { + // First check if we can extend the input to the end of the stream + while (r.input_count < full_input_length && + base64_ignorable(*(srcinit + r.input_count), options)) { + r.input_count++; + } + // If we are still not at the end of the stream, then we must backtrack + // to the last non-ignorable character. 
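The word-at-a-time tail above packs four sextets into the top 24 bits of a 32-bit word, byte-swaps on little-endian targets so that memcpy emits the payload most-significant byte first, and (in the first loop) writes 4 bytes while advancing dst by only 3, letting the next write overwrite the stray zero byte. A sketch of one step, assuming a little-endian target and the GCC/Clang __builtin_bswap32 in place of the library's scalar::u32_swap_bytes:

#include <cstdint>
#include <cstring>

static void decode_quad(const uint8_t sextet[4], uint8_t out[3]) {
  uint32_t triple = (uint32_t(sextet[0]) << 18 | uint32_t(sextet[1]) << 12 |
                     uint32_t(sextet[2]) << 6 | uint32_t(sextet[3]))
                    << 8;              // 24 payload bits in bits 31..8
  triple = __builtin_bswap32(triple);  // little-endian: put bytes in order
  std::memcpy(out, &triple, 3);
}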
+ if (r.input_count < full_input_length) { + while (r.input_count > 0 && + base64_ignorable(*(srcinit + r.input_count - 1), options)) { + r.input_count--; + } + } + } + return r; + } + if (equalsigns > 0 && !ignore_garbage) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; + } + } + return {SUCCESS, srclen, size_t(dst - dstinit)}; +} +/* end file src/arm64/arm_base64.cpp */ +/* begin file src/arm64/arm_find.cpp */ +simdutf_really_inline const char *util_find(const char *start, const char *end, + char character) noexcept { + // Handle empty or invalid range + if (start >= end) + return end; + + const size_t widestep = 64; + const size_t step = 16; + uint8x16_t char_vec = vdupq_n_u8(static_cast(character)); + + // Handle unaligned beginning + uintptr_t misalignment = reinterpret_cast(start) % step; + if (misalignment != 0) { + size_t adjustment = step - misalignment; + if (size_t(end - start) < adjustment) { + adjustment = end - start; + } + for (size_t i = 0; i < adjustment; ++i) { + if (start[i] == character) { + return start + i; + } + } + start += adjustment; + } + + // Main loop for full 64-byte chunks + while (size_t(end - start) >= widestep) { + uint8x16_t data1 = vld1q_u8(reinterpret_cast(start)); + uint8x16_t data2 = vld1q_u8(reinterpret_cast(start) + 16); + uint8x16_t data3 = vld1q_u8(reinterpret_cast(start) + 32); + uint8x16_t data4 = vld1q_u8(reinterpret_cast(start) + 48); + + uint8x16_t cmp1 = vceqq_u8(data1, char_vec); + uint8x16_t cmp2 = vceqq_u8(data2, char_vec); + uint8x16_t cmp3 = vceqq_u8(data3, char_vec); + uint8x16_t cmp4 = vceqq_u8(data4, char_vec); + uint8x16_t cmpall = vorrq_u8(vorrq_u8(cmp1, cmp2), vorrq_u8(cmp3, cmp4)); + + uint64_t mask = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmpall), 4)), 0); + + if (mask != 0) { + // Found a match, return the first one + uint64_t mask1 = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp1), 4)), 0); + if (mask1 != 0) { + // Found a match in the first chunk + int index = trailing_zeroes(mask1) / 4; // Each character maps to 4 bits + return start + index; + } + uint64_t mask2 = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp2), 4)), 0); + if (mask2 != 0) { + // Found a match in the second chunk + int index = trailing_zeroes(mask2) / 4; // Each character maps to 4 bits + return start + index + 16; + } + uint64_t mask3 = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp3), 4)), 0); + if (mask3 != 0) { + // Found a match in the third chunk + int index = trailing_zeroes(mask3) / 4; // Each character maps to 4 bits + return start + index + 32; + } + uint64_t mask4 = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp4), 4)), 0); + if (mask4 != 0) { + // Found a match in the fourth chunk + int index = trailing_zeroes(mask4) / 4; // Each character maps to 4 bits + return start + index + 48; + } + } - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); - const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); - const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); - const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); - const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + start += widestep; + } - const size_t safety_margin = 12; 
// to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + // Main loop for full 16-byte chunks + while (size_t(end - start) >= step) { + uint8x16_t data = vld1q_u8(reinterpret_cast(start)); + uint8x16_t cmp = vceqq_u8(data, char_vec); + uint64_t mask = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4)), 0); - while (buf + 16 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); - // Check for too large input - const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); - if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { - return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); + if (mask != 0) { + // Found a match, return the first one + int index = trailing_zeroes(mask) / 4; // Each character maps to 4 bits + return start + index; } - // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation - __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); - in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); - - // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp) + start += step; + } - if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! + // Handle remaining bytes with scalar loop + for (; start < end; ++start) { + if (*start == character) { + return start; } - // no bits set above 7th bit - const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); - const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); - - // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + } - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in_16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in_16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); + return end; +} - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); +simdutf_really_inline const char16_t *util_find(const char16_t *start, + const char16_t *end, + char16_t character) noexcept { + // Handle empty or invalid range + if (start >= end) + return end; - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. 
pack the bytes + const size_t step = 8; + uint16x8_t char_vec = vdupq_n_u16(character); - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; + // Handle unaligned beginning + uintptr_t misalignment = + reinterpret_cast(start) % (step * sizeof(char16_t)); + if (misalignment != 0 && misalignment % 2 == 0) { + size_t adjustment = + (step * sizeof(char16_t) - misalignment) / sizeof(char16_t); + if (size_t(end - start) < adjustment) { + adjustment = end - start; + } + for (size_t i = 0; i < adjustment; ++i) { + if (start[i] == character) { + return start + i; + } + } + start += adjustment; + } - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); + // Main loop for full 8-element chunks with unrolling + while (size_t(end - start) >= 4 * step) { + uint16x8_t data1 = vld1q_u16(reinterpret_cast(start)); + uint16x8_t data2 = + vld1q_u16(reinterpret_cast(start) + step); + uint16x8_t data3 = + vld1q_u16(reinterpret_cast(start) + 2 * step); + uint16x8_t data4 = + vld1q_u16(reinterpret_cast(start) + 3 * step); - const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); - // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); - utf8_output += row_2[0]; + uint16x8_t cmp1 = vceqq_u16(data1, char_vec); + uint16x8_t cmp2 = vceqq_u16(data2, char_vec); + uint16x8_t cmp3 = vceqq_u16(data3, char_vec); + uint16x8_t cmp4 = vceqq_u16(data4, char_vec); - // 6. adjust pointers - buf += 16; - continue; + uint64_t mask1 = vget_lane_u64( + vreinterpret_u64_u16(vshrn_n_u32(vreinterpretq_u32_u16(cmp1), 4)), 0); + if (mask1 != 0) { + int index = trailing_zeroes(mask1) / 8; + return start + index; } - // Must check for overflow in packing - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - - // Check for illegal surrogate code units - const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); - const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output); - } - const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + uint64_t mask2 = vget_lane_u64( + vreinterpret_u64_u16(vshrn_n_u32(vreinterpretq_u32_u16(cmp2), 4)), 0); + if (mask2 != 0) { + int index = trailing_zeroes(mask2) / 8; + return start + index + step; + } - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + uint64_t mask3 = vget_lane_u64( + vreinterpret_u64_u16(vshrn_n_u32(vreinterpretq_u32_u16(cmp3), 4)), 0); + if (mask3 != 0) { + int index = trailing_zeroes(mask3) / 8; + return start + index + 2 * step; + } - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + uint64_t mask4 = vget_lane_u64( + vreinterpret_u64_u16(vshrn_n_u32(vreinterpretq_u32_u16(cmp4), 4)), 0); + if (mask4 != 0) { + int index = trailing_zeroes(mask4) / 8; + return start + index + 3 * step; + } - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + start += 4 * step; + } - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + // Main loop for full 8-element chunks + while (size_t(end - start) >= step) { + uint16x8_t data = vld1q_u16(reinterpret_cast(start)); + uint16x8_t cmp = vceqq_u16(data, char_vec); + uint64_t mask = vget_lane_u64( + vreinterpret_u64_u16(vshrn_n_u32(vreinterpretq_u32_u16(cmp), 4)), 0); - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000)); + if (mask != 0) { + int index = trailing_zeroes(mask) / 8; + return start + index; + } - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); -#undef simdutf_vec + start += step; + } - // 4. expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + // Handle remaining elements with scalar loop + for (; start < end; ++start) { + if (*start == character) { + return start; + } + } - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be useful. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. 
- const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); - const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); - utf8_output += 12; - buf += 16; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + return end; +} +/* end file src/arm64/arm_find.cpp */ +#endif // SIMDUTF_FEATURE_BASE64 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */ +std::pair +arm_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + while (end - buf >= 8) { + uint32x4_t in1 = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t in2 = vld1q_u32(reinterpret_cast(buf + 4)); - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2)); + if (vmaxvq_u16(utf16_packed) <= 0xff) { + // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); - const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); +std::pair +arm_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; + while (end - buf >= 8) { + uint32x4_t in1 = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t in2 = vld1q_u32(reinterpret_cast(buf + 4)); - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); - const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2)); - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; + if (vmaxvq_u16(utf16_packed) <= 0xff) { + // 1. 
pack the bytes + uint8x8_t latin1_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. adjust pointers + buf += 8; + latin1_output += 8; } else { - // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // may require large, non-trivial tables? - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { + for (int k = 0; k < 8; k++) { uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { // 2-byte - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word & 0xFFFF0000 )==0) { // 3-byte - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { // 4-byte - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); } } - buf += k; } } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} +/* end file src/arm64/arm_convert_utf32_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF16 +/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */ +struct expansion_result_t { + size_t u16count; + uint8x16_t compressed_v; +}; - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +// This function is used to check for invalid UTF-32 characters +// and surrogate pairs in the input +simdutf_really_inline uint64_t invalid_utf32(const uint32x4x2_t in) { + const auto standardmax = vdupq_n_u32(0x10ffff); + const auto v_d800 = vdupq_n_u32(0xd800); + const auto v_fffff800 = vdupq_n_u32(0xfffff800); + const auto too_large1 = vcgtq_u32(in.val[0], standardmax); + const auto too_large2 = vcgtq_u32(in.val[1], standardmax); + const auto surrogate1 = vceqq_u32(vandq_u32(in.val[0], v_fffff800), v_d800); + const auto surrogate2 = vceqq_u32(vandq_u32(in.val[1], v_fffff800), v_d800); + const auto err1 = vorrq_u32(too_large1, surrogate1); + const auto err2 = vorrq_u32(too_large2, surrogate2); + const auto err = + vuzp2q_u16(vreinterpretq_u16_u32(err1), vreinterpretq_u16_u32(err2)); + + return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(err, 8)), 0); +} + +// This function is used to check for surrogate pairs in the input +simdutf_really_inline uint64_t fast_invalid_utf32(const uint32x4x2_t in) { + const auto v_d800 = vdupq_n_u32(0xd800); + const auto v_fffff800 = vdupq_n_u32(0xfffff800); + const auto surrogate1 = vceqq_u32(vandq_u32(in.val[0], v_fffff800), v_d800); + const auto surrogate2 = vceqq_u32(vandq_u32(in.val[1], v_fffff800), v_d800); + const auto err = 
vuzp2q_u16(vreinterpretq_u16_u32(surrogate1), + vreinterpretq_u16_u32(surrogate2)); + return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(err, 8)), 0); +} + +template +simdutf_really_inline expansion_result_t +neon_expand_surrogate(const uint32x4_t in) { + const uint32x4_t v_ffff0000 = vdupq_n_u32(0xffff0000); + const uint32x4_t non_surrogate_mask = vceqzq_u32(vandq_u32(in, v_ffff0000)); + const uint64_t cmp_bits = + vget_lane_u64(vreinterpret_u64_u32(vshrn_n_u64( + vreinterpretq_u64_u32(non_surrogate_mask), 31)), + 0); + const uint8_t mask = + uint8_t(~((cmp_bits & 0x3) | ((cmp_bits >> 30) & 0xc)) & 0xf); + const uint32x4_t v_10000 = vdupq_n_u32(0x00010000); + const uint32x4_t t0 = vsubq_u32(in, v_10000); + const uint32x4_t t1 = vandq_u32(t0, vdupq_n_u32(0xfffff)); + const uint32x4_t t2 = vshrq_n_u32(t1, 10); + const uint32x4_t t3 = vsliq_n_u32(t2, t1, 16); + const uint32x4_t surrogates = vorrq_u32( + vandq_u32(t3, vdupq_n_u32(0x03ff03ff)), vdupq_n_u32(0xdc00d800)); + const uint8x16_t merged = + vreinterpretq_u8_u32(vbslq_u32(non_surrogate_mask, in, surrogates)); + + const uint8x16_t shuffle_v = vld1q_u8(reinterpret_cast( + (byte_order == endianness::LITTLE) + ? tables::utf32_to_utf16::pack_utf32_to_utf16le[mask] + : tables::utf32_to_utf16::pack_utf32_to_utf16be[mask])); + + const size_t u16count = 4 + vget_lane_u8(vcnt_u8(vcreate_u8(mask)), 0); + const uint8x16_t compressed_v = vqtbl1q_u8(merged, shuffle_v); + + return {u16count, compressed_v}; } -/* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ -// file included directly -// Todo: currently, this is just the haswell code, optimize for icelake kernel. template -std::pair avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { - const char32_t* end = buf + len; - - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 - __m256i forbidden_bytemask = _mm256_setzero_si256(); +std::pair +arm_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); + const char32_t *end = buf + len; + uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0); + // To avoid buffer overflow while writing compressed_v + const size_t safety_margin = 4; + while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { + uint32x4x2_t in = vld1q_u32_x2(reinterpret_cast(buf)); - while (buf + 8 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); - - // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); - - if (saturation_bitmask == 0xffffffff) { - const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); - forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); - - __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + // Check if no bits set 
above 16th + uint32_t max_val = vmaxvq_u32(vmaxq_u32(in.val[0], in.val[1])); + if (simdutf_likely(max_val <= 0xFFFF)) { + uint16x8_t utf16_packed = vuzp1q_u16(vreinterpretq_u16_u32(in.val[0]), + vreinterpretq_u16_u32(in.val[1])); + + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + forbidden_bytemask = + vorrq_u16(vceqq_u16(vandq_u16(utf16_packed, v_f800), v_d800), + forbidden_bytemask); + + if simdutf_constexpr (!match_system(big_endian)) { + utf16_packed = vreinterpretq_u16_u8( + vrev16q_u8(vreinterpretq_u8_u16(utf16_packed))); } - _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + vst1q_u16(utf16_output, utf16_packed); utf16_output += 8; buf += 8; } else { - size_t forward = 7; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } - *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } + if (simdutf_unlikely(fast_invalid_utf32(in) || max_val > 0x10ffff)) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); } - buf += k; + expansion_result_t res = neon_expand_surrogate(in.val[0]); + vst1q_u8(reinterpret_cast(utf16_output), res.compressed_v); + utf16_output += res.u16count; + res = neon_expand_surrogate(in.val[1]); + vst1q_u8(reinterpret_cast(utf16_output), res.compressed_v); + utf16_output += res.u16count; + buf += 8; } } // check for invalid input - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } + if (vmaxvq_u32(vreinterpretq_u32_u16(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, reinterpret_cast(utf16_output)); + } - return std::make_pair(buf, utf16_output); + return std::make_pair(buf, reinterpret_cast(utf16_output)); } -// Todo: currently, this is just the haswell code, optimize for icelake kernel. 
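// [editor's note] neon_expand_surrogate above vectorizes the standard
// UTF-16 surrogate-pair encoding. As a minimal scalar sketch of the same
// math (mirroring the scalar fallback later in this file; the helper name
// is illustrative, not part of simdutf):
//
//   // Encode a code point cp in [0x10000, 0x10FFFF] as a surrogate pair.
//   inline void to_surrogate_pair(uint32_t cp, uint16_t &hi, uint16_t &lo) {
//     const uint32_t v = cp - 0x10000;     // 20 payload bits remain
//     hi = uint16_t(0xD800 + (v >> 10));   // high (leading) surrogate
//     lo = uint16_t(0xDC00 + (v & 0x3FF)); // low (trailing) surrogate
//   }
//
// The vector version computes these values for four lanes at once, then
// compresses away the lanes that fit in a single 16-bit code unit.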
template -std::pair avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { - const char32_t* start = buf; - const char32_t* end = buf + len; - - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 - - while (buf + 8 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); +std::pair +arm_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); + const char32_t *start = buf; + const char32_t *end = buf + len; - // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + // To avoid buffer overflow while writing compressed_v + const size_t safety_margin = 4; + while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { + uint32x4x2_t in = vld1q_u32_x2(reinterpret_cast(buf)); - if (saturation_bitmask == 0xffffffff) { - const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); - const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); + // Check if no bits set above 16th + uint32_t max_val = vmaxvq_u32(vmaxq_u32(in.val[0], in.val[1])); + if (simdutf_likely(max_val <= 0xFFFF)) { + uint16x8_t utf16_packed = vuzp1q_u16(vreinterpretq_u16_u32(in.val[0]), + vreinterpretq_u16_u32(in.val[1])); + + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t forbidden_bytemask = + vceqq_u16(vandq_u16(utf16_packed, v_f800), v_d800); + if (vmaxvq_u16(forbidden_bytemask) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf16_output)); } - __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + if simdutf_constexpr (!match_system(big_endian)) { + utf16_packed = vreinterpretq_u16_u8( + vrev16q_u8(vreinterpretq_u8_u16(utf16_packed))); } - _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + vst1q_u16(utf16_output, utf16_packed); utf16_output += 8; buf += 8; } else { - size_t forward = 7; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } - *utf16_output++ = big_endian ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + const uint64_t err = + max_val <= 0x10ffff ? fast_invalid_utf32(in) : invalid_utf32(in); + if (simdutf_unlikely(err)) { + const size_t pos = trailing_zeroes(err) / 8; + for (size_t k = 0; k < pos; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + *utf16_output++ = !match_system(big_endian) + ? char16_t(word >> 8 | word << 8) + : char16_t(word); + } else { + // will generate a surrogate pair + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); } + const uint32_t word = buf[pos]; + const size_t error_pos = buf - start + pos; + if (word > 0x10FFFF) { + return {result(error_code::TOO_LARGE, error_pos), + reinterpret_cast(utf16_output)}; + } + if (word >= 0xD800 && word <= 0xDFFF) { + return {result(error_code::SURROGATE, error_pos), + reinterpret_cast(utf16_output)}; + } + return {result(error_code::OTHER, error_pos), + reinterpret_cast(utf16_output)}; } - buf += k; + expansion_result_t res = neon_expand_surrogate(in.val[0]); + vst1q_u8(reinterpret_cast(utf16_output), res.compressed_v); + utf16_output += res.u16count; + res = neon_expand_surrogate(in.val[1]); + vst1q_u8(reinterpret_cast(utf16_output), res.compressed_v); + utf16_output += res.u16count; + buf += 8; } } - return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf16_output)); } -/* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ -/* begin file src/icelake/icelake_ascii_validation.inl.cpp */ -// file included directly +/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF8 +/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */ +std::pair +arm_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char32_t *end = buf + len; -bool validate_ascii(const char* buf, size_t len) { - const char* end = buf + len; - const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); - __m512i running_or = _mm512_setzero_si512(); - for (; buf + 64 <= end; buf += 64) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i*)buf); - running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii) - } - if(buf < end) { - const __m512i utf8 = _mm512_maskz_loadu_epi8((uint64_t(1) << (end-buf)) - 1,(const __m512i*)buf); - running_or = 
_mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii) - } - return (_mm512_test_epi8_mask(running_or, running_or) == 0); -} -/* end file src/icelake/icelake_ascii_validation.inl.cpp */ -/* begin file src/icelake/icelake_utf32_validation.inl.cpp */ -// file included directly + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); -const char32_t* validate_utf32(const char32_t* buf, size_t len) { - const char32_t* end = len >= 16 ? buf + len - 16 : nullptr; + uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 - const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000); - __m512i currentmax = _mm512_setzero_si512(); - __m512i currentoffsetmax = _mm512_setzero_si512(); + while (buf + 16 + safety_margin < end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf + 4)); - while (buf <= end) { - __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf); - buf += 16; - currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax); - currentmax = _mm512_max_epu32(utf32, currentmax); - } + // Check if no bits set above 16th + if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) + uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); + if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } - const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff); - const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff); - __m512i is_zero = _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax); - if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) { - return nullptr; - } - is_zero = _mm512_xor_si512(_mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) { - return nullptr; - } + if (vmaxvq_u16(utf16_packed) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - return buf; -} -/* end file src/icelake/icelake_utf32_validation.inl.cpp */ -/* begin file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */ -// file included directly + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16( + vbslq_u16(one_byte_bytemask, utf16_packed, t4)); + // 3. 
prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); -static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len, char *utf8_output, int mask_output) { - __mmask64 nonascii = _mm512_movepi8_mask(input); - size_t output_size = input_len + (size_t)count_ones(nonascii); + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); - // Mask to denote whether the byte is a leading byte that is not ascii - __mmask64 sixth = - _mm512_cmpge_epu8_mask(input, _mm512_set1_epi8(-64)); //binary representation of -64: 1100 0000 + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); + forbidden_bytemask = + vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), + vcgeq_u16(utf16_packed, v_d800)), + forbidden_bytemask); - const uint64_t alternate_bits = UINT64_C(0x5555555555555555); - uint64_t ascii = ~nonascii; - // the bits in ascii are inverted and zeros are interspersed in between them - uint64_t maskA = ~_pdep_u64(ascii, alternate_bits); - uint64_t maskB = ~_pdep_u64(ascii>>32, alternate_bits); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = simdutf_make_uint16x8_t( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes - // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD) - __m512i input_interleaved = _mm512_permutexvar_epi8(_mm512_set_epi32( - 0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818, - 0x37173616, 0x35153414, 0x33133212, 0x31113010, - 0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808, - 0x27072606, 0x25052404, 0x23032202, 0x21012000 - ), input); + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - // double size of each byte, and insert the leading byte 1100 0010 + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -/* -upscale the bytes to 16-bit value, adding the 0b11000000 leading byte in the process. -We adjust for the bytes that have their two most significant bits. This takes care of the first 32 bytes, assuming we interleaved the bytes. */ - __m512i outputA = _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8); - outputA = _mm512_mask_add_epi16( - outputA, - (__mmask32)sixth, - outputA, - _mm512_set1_epi16(1 - 0x4000)); // 1- 0x4000 = 1100 0000 0000 0001???? 
- - // in the second 32-bit half, set first or second option based on whether original input is leading byte (second case) or not (first case) - __m512i leadingB = _mm512_mask_blend_epi16( - (__mmask32)(sixth>>32), - _mm512_set1_epi16(0x00c2), // 0000 0000 1101 0010 - _mm512_set1_epi16(0x40c3));// 0100 0000 1100 0011 - __m512i outputB = _mm512_ternarylogic_epi32( - input_interleaved, - leadingB, - _mm512_set1_epi16((short)0xff00), - (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. - // prune redundant bytes - outputA = _mm512_maskz_compress_epi8(maskA, outputA); - outputB = _mm512_maskz_compress_epi8(maskB, outputB); + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = + vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), + vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = + vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = + vcleq_u16(utf16_packed, v_07ff); + const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), + one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef simdutf_vec - size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32; + // 4. expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); - if(mask_output) { - if(input_len > 32) { // is the second half of the input vector used? - __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA); - _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA); - utf8_output += output_sizeA; - write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA)); - _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB); - } else { - __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size); - _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA); - } - } else { - _mm512_storeu_si512(utf8_output, outputA); - utf8_output += output_sizeA; - _mm512_storeu_si512(utf8_output, outputB); - } - return output_size; -} + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); + const uint16x8_t twomask = simdutf_make_uint16x8_t( + 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); +#else + const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0100, 0x0400, 0x1000, 0x4000}; + const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, + 0x0200, 0x0800, 0x2000, 0x8000}; +#endif + const uint16x8_t combined = + vorrq_u16(vandq_u16(one_byte_bytemask, onemask), + vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); -static inline size_t latin1_to_utf8_avx512_branch(__m512i input, char *utf8_output) { - __mmask64 nonascii = _mm512_movepi8_mask(input); - if(nonascii) { - return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0); - } else { - _mm512_storeu_si512(utf8_output, input); - return 64; - } -} + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); -size_t latin1_to_utf8_avx512_start(const char *buf, size_t len, char *utf8_output) { - char *start = utf8_output; - size_t pos = 0; - // if there's at least 128 bytes remaining, we don't need to mask the output - for (; pos + 128 <= len; pos += 64) { - __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos)); - utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output); - } - // in the last 128 bytes, the first 64 may require masking the output - if (pos + 64 <= len) { - __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos)); - utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1); - pos += 64; - } - // with the last 64 bytes, the input also needs to be masked - if (pos < len) { - __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos)); - __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos)); - utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1); - } - return (size_t)(utf8_output - start); -} -/* end file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */ -/* begin file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */ -// file included directly -template -size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len, - char16_t *utf16_output) { - size_t rounded_len = len & ~0x1F; // Round down to nearest multiple of 32 + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; - 
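// [editor's note] The pack_1_2_3_utf8_bytes rows consumed just above follow
// a simple layout that the loads make explicit: row[0] is the number of
// UTF-8 bytes this mask produces, and the 16 bytes starting at row + 1 are
// the shuffle indices fed to vqtbl1q_u8. A sketch of the access pattern
// (a restatement of the surrounding code, not a new API):
//
//   const uint8_t *row =
//       &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
//   const uint8x16_t shuffle = vld1q_u8(row + 1);  // 16 shuffle indices
//   vst1q_u8(utf8_output, vqtbl1q_u8(out0, shuffle));
//   utf8_output += row[0];                         // advance by byte count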
__m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - for (size_t i = 0; i < rounded_len; i += 32) { - // Load 32 Latin1 characters into a 256-bit register - __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]); - // Zero extend each set of 8 Latin1 characters to 32 16-bit integers - __m512i out = _mm512_cvtepu8_epi16(in); - if (big_endian) { - out = _mm512_shuffle_epi8(out, byteflip); + buf += 8; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; } - // Store the results back to memory - _mm512_storeu_si512((__m512i *)&utf16_output[i], out); - } - if (rounded_len != len) { - uint32_t mask = uint32_t(1 << (len - rounded_len)) - 1; - __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len); + } // while - // Zero extend each set of 8 Latin1 characters to 32 16-bit integers - __m512i out = _mm512_cvtepu8_epi16(in); - if (big_endian) { - out = _mm512_shuffle_epi8(out, byteflip); - } - // Store the results back to memory - _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out); + // check for invalid input + if (vmaxvq_u16(forbidden_bytemask) != 0) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } - - return len; + return std::make_pair(buf, reinterpret_cast(utf8_output)); } -/* end file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */ -/* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */ -std::pair avx512_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) { - size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - - for (size_t i = 0; i < rounded_len; i += 16) { - // Load 16 Latin1 characters into a 128-bit register - __m128i in = _mm_loadu_si128((__m128i*)&buf[i]); - // Zero extend each set of 8 Latin1 characters to 16 32-bit integers using vpmovzxbd - __m512i out = _mm512_cvtepu8_epi32(in); +std::pair +arm_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char32_t *start = buf; + const char32_t *end = buf + len; - // 
Store the results back to memory - _mm512_storeu_si512((__m512i*)&utf32_output[i], out); - } + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 - // Return pointers pointing to where we left off - return std::make_pair(buf + rounded_len, utf32_output + rounded_len); -} -/* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */ -/* begin file src/icelake/icelake_base64.inl.cpp */ -// file included directly -/** - * References and further reading: - * - * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the - * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. - * https://arxiv.org/abs/1910.05109 - * - * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 - * Instructions, ACM Transactions on the Web 12 (3), 2018. - * https://arxiv.org/abs/1704.00605 - * - * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. - * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, - * Request for Comments: 4648. - * - * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. - * http://www.alfredklomp.com/programming/sse-base64/. (2014). - * - * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD - * acceleration. https://github.com/aklomp/base64. (2014). - * - * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). - * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ - * - * Nick Kopp. 2013. Base64 Encoding on a GPU. - * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). - */ + while (buf + 16 + safety_margin < end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf + 4)); -struct block64 { - __m512i chunks[1]; -}; + // Check if no bits set above 16th + if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) + uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); + if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } -template -size_t encode_base64(char *dst, const char *src, size_t srclen, - base64_options options) { - // credit: Wojciech Muła - const uint8_t *input = (const uint8_t *)src; + if (vmaxvq_u16(utf16_packed) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - uint8_t *out = (uint8_t *)dst; - static const char *lookup_tbl = - base64_url - ? 
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" - : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16( + vbslq_u16(one_byte_bytemask, utf16_packed, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); - const __m512i shuffle_input = _mm512_setr_epi32( - 0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10, - 0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122, - 0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e); - const __m512i lookup = - _mm512_loadu_si512(reinterpret_cast(lookup_tbl)); - const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a)); - size_t i = 0; - for (; i + 64 <= srclen; i += 48) { - const __m512i v = - _mm512_loadu_si512(reinterpret_cast(input + i)); - const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); - const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in); - const __m512i result = _mm512_permutexvar_epi8(indices, lookup); - _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result); - out += 64; - } - return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, - srclen - i, options); -} + // 5. 
store bytes + vst1q_u8(utf8_output, utf8_packed); -template -static inline uint64_t to_base64_mask(block64 *b, bool *error) { - __m512i input = b->chunks[0]; - const __m512i ascii_space_tbl = _mm512_set_epi8( - 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, - 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, - 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32); - __m512i lookup0; - if (base64_url) { - lookup0 = _mm512_set_epi8( - -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, - 52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, - -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1); - } else { - lookup0 = _mm512_set_epi8( - -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, - 52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, - -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128); - } - __m512i lookup1; - if (base64_url) { - lookup1 = _mm512_set_epi8( - -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, - 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, - 63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, - 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); - } else { - lookup1 = _mm512_set_epi8( - -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, - 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, - -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); - } + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1); - const __m512i combined = _mm512_or_si512(translated, input); - const __mmask64 mask = _mm512_movepi8_mask(combined); - if (mask) { - const __mmask64 spaces = _mm512_cmpeq_epi8_mask( - _mm512_shuffle_epi8(ascii_space_tbl, input), input); - *error |= (mask != spaces); - } - b->chunks[0] = translated; + // check for invalid input + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); + const uint16x8_t forbidden_bytemask = vandq_u16( + vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)); + if (vmaxvq_u16(forbidden_bytemask) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf8_output)); + } - return mask; -} +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = simdutf_make_uint16x8_t( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes -static inline void copy_block(block64 *b, char *output) { - _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]); -} + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. -static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { - uint64_t nmask = ~mask; - __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]); - _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c); - return _mm_popcnt_u64(nmask); -} + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -// The caller of this function is responsible to ensure that there are 64 bytes available -// from reading at src. The data is read into a block64 structure. -static inline void load_block(block64 *b, const char *src) { - b->chunks[0] = _mm512_loadu_si512(reinterpret_cast(src)); -} + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. -// The caller of this function is responsible to ensure that there are 128 bytes available -// from reading at src. The data is read into a block64 structure. -static inline void load_block(block64 *b, const char16_t *src) { - __m512i m1 = _mm512_loadu_si512(reinterpret_cast(src)); - __m512i m2 = _mm512_loadu_si512(reinterpret_cast(src + 32)); - __m512i p = _mm512_packus_epi16(m1, m2); - b->chunks[0] = - _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); -} + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = + vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), + vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); -static inline void base64_decode(char *out, __m512i str) { - const __m512i merge_ab_and_bc = - _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140)); - const __m512i merged = - _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); - const __m512i pack = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58, - 52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34, - 28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4, - 5, 6, 0, 1, 2); - const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); - _mm512_mask_storeu_epi8( - (__m512i *)out, 0xffffffffffff, - shuffled); // mask would be 0xffffffffffff since we write 48 bytes. 
-} -// decode 64 bytes and output 48 bytes -static inline void base64_decode_block(char *out, const char *src) { - base64_decode(out, - _mm512_loadu_si512(reinterpret_cast(src))); -} -static inline void base64_decode_block(char *out, block64 *b) { - base64_decode(out, b->chunks[0]); -} + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = + vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = + vcleq_u16(utf16_packed, v_07ff); + const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), + one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef simdutf_vec -template -result compress_decode_base64(char *dst, const chartype *src, size_t srclen, - base64_options options) { - const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value - : tables::base64::to_base64_value; - size_t equalsigns = 0; - // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; - } - if (srclen > 0 && src[srclen - 1] == '=') { - srclen--; - equalsigns = 1; - // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; - } - if (srclen > 0 && src[srclen - 1] == '=') { - srclen--; - equalsigns = 2; - } - } - const chartype *const srcinit = src; - const char *const dstinit = dst; - const chartype *const srcend = src + srclen; + // 4. expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); - // figure out why block_size == 2 is sometimes best??? - constexpr size_t block_size = 6; - char buffer[block_size * 64]; - char *bufferptr = buffer; - if (srclen >= 64) { - const chartype *const srcend64 = src + srclen - 64; - while (src <= srcend64) { - block64 b; - load_block(&b, src); - src += 64; - bool error = false; - uint64_t badcharmask = to_base64_mask(&b, &error); - if (error) { - src -= 64; - while (src < srcend && to_base64[uint8_t(*src)] <= 64) { - src++; - } - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); + const uint16x8_t twomask = simdutf_make_uint16x8_t( + 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); +#else + const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0100, 0x0400, 0x1000, 0x4000}; + const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, + 0x0200, 0x0800, 0x2000, 0x8000}; +#endif + const uint16x8_t combined = + vorrq_u16(vandq_u16(one_byte_bytemask, onemask), + vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. 
+ /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; } - if (badcharmask != 0) { - // optimization opportunity: check for simple masks like those made of - // continuous 1s followed by continuous 0s. And masks containing a - // single bad character. - bufferptr += compress_block(&b, badcharmask, bufferptr); - } else if (bufferptr != buffer) { - copy_block(&b, bufferptr); - bufferptr += 64; - } else { - base64_decode_block(dst, &b); - dst += 48; + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); } - if (bufferptr >= (block_size - 1) * 64 + buffer) { - for (size_t i = 0; i < (block_size - 1); i++) { - base64_decode_block(dst, buffer + i * 64); - dst += 48; + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); } - std::memcpy(buffer, buffer + (block_size - 1) * 64, - 64); // 64 might be too much - bufferptr -= (block_size - 1) * 64; } + buf += k; } - } + } // while - char *buffer_start = buffer; - // Optimization note: if this is almost full, then it is worth our - // time, otherwise, we should just decode directly. 
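// [editor's note] A hedged usage sketch for the *_with_errors converters in
// this section; the caller-side names and buffer sizing here are illustrative
// assumptions, not simdutf API:
//
//   std::vector<char> out(4 * len); // UTF-32 -> UTF-8 worst case: 4 bytes/code point
//   auto [res, out_end] =
//       arm_convert_utf32_to_utf8_with_errors(src, len, out.data());
//   if (res.error != error_code::SUCCESS) {
//     // res.count indexes the offending code point in the input; everything
//     // before it has already been written to [out.data(), out_end).
//   }
//
// On success, res.count reports how many input code points were consumed;
// the tail shorter than 16 + safety_margin is left over, presumably to be
// finished by the scalar routines in the dispatching code.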
- int last_block = (int)((bufferptr - buffer_start) % 64); - if (last_block != 0 && srcend - src + last_block >= 64) { + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); +} +/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF8 - while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { - uint8_t val = to_base64[uint8_t(*src)]; - *bufferptr = char(val); - if (val > 64) { - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; - } - bufferptr += (val <= 63); - src++; - } - } +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf - for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { - base64_decode_block(dst, buffer_start); - dst += 48; - } - if ((bufferptr - buffer_start) % 64 != 0) { - while (buffer_start + 4 < bufferptr) { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 4); - dst += 3; - buffer_start += 4; - } - if (buffer_start + 4 <= bufferptr) { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 3); - dst += 3; - buffer_start += 4; - } - // we may have 1, 2 or 3 bytes left and we need to decode them so let us - // bring in src content - int leftover = int(bufferptr - buffer_start); - if (leftover > 0) { - while (leftover < 4 && src < srcend) { - uint8_t val = to_base64[uint8_t(*src)]; - if (val > 64) { - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; - } - buffer_start[leftover] = char(val); - leftover += (val <= 63); - src++; - } - - if (leftover == 1) { - return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)}; - } - if (leftover == 2) { - uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) + - (uint32_t(buffer_start[1]) << 2 * 6); - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 1); - dst += 1; - } else if (leftover == 3) { - uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) + - (uint32_t(buffer_start[1]) << 2 * 6) + - (uint32_t(buffer_start[2]) << 1 * 6); - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - - std::memcpy(dst, &triple, 2); - dst += 2; - } else { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 3); - dst += 3; - } - } +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace arm64 { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with +// spaces +template struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. 
+ * + * There will always be a last block, with at least 1 byte, unless len == 0 + * (in which case this function fills the buffer with spaces and returns 0. In + * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder + * block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); + +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text_64(const uint8_t *text) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); } - if (src < srcend + equalsigns) { - result r = - scalar::base64::base64_tail_decode(dst, src, srcend - src, options); - if (r.error == error_code::INVALID_BASE64_CHARACTER) { - r.count += size_t(src - srcinit); - return r; - } else { - r.count += size_t(dst - dstinit); - } - if(r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - r.error = error_code::INVALID_BASE64_CHARACTER; - } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text(const simd8x64 &in) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + if (buf[i] < ' ') { + buf[i] = '_'; } - return r; } - if(equalsigns > 0) { - if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)}; - } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char *format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i = 0; i < 64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; } - return {SUCCESS, size_t(dst - dstinit)}; + buf[64] = '\0'; + return buf; } -/* end file src/icelake/icelake_base64.inl.cpp */ +template +simdutf_really_inline +buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) + : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, + idx{0} {} -#include +template +simdutf_really_inline size_t buf_block_reader::block_index() { + return idx; +} -} // namespace -} // namespace icelake -} // namespace simdutf +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} -namespace simdutf { -namespace icelake { +template +simdutf_really_inline const uint8_t * +buf_block_reader::full_block() const { + return &buf[idx]; +} -simdutf_warn_unused int -implementation::detect_encodings(const char *input, - size_t length) const noexcept { - // If there is a BOM, then we trust it. 
- auto bom_encoding = simdutf::BOM::check_bom(input, length); - if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - if (length % 2 == 0) { - const char *buf = input; - - const char *start = buf; - const char *end = input + length; - - bool is_utf8 = true; - bool is_utf16 = true; - bool is_utf32 = true; - - int out = 0; - - avx512_utf8_checker checker{}; - __m512i currentmax = _mm512_setzero_si512(); - while (buf + 64 <= end) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if (surrogates) { - is_utf8 = false; - - // Can still be either UTF-16LE or UTF-32 depending on the positions - // of the surrogates To be valid UTF-32, a surrogate cannot be in the - // two most significant bytes of any 32-bit word. On the other hand, to - // be valid UTF-16LE, at least one surrogate must be in the two most - // significant bytes of a 32-bit word since they always come in pairs in - // UTF-16LE. Note that we always proceed in multiple of 4 before this - // point so there is no offset in 32-bit code units. - - if ((surrogates & 0xaaaaaaaa) != 0) { - is_utf32 = false; - __mmask32 highsurrogates = _mm512_cmplt_epu16_mask( - diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - return simdutf::encoding_type::unspecified; - } +template +simdutf_really_inline size_t +buf_block_reader::get_remainder(uint8_t *dst) const { + if (len == idx) { + return 0; + } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, + STEP_SIZE); // std::memset STEP_SIZE because it is more efficient + // to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} - bool ends_with_high = ((highsurrogates & 0x80000000) != 0); - if (ends_with_high) { - buf += - 31 * - sizeof(char16_t); // advance only by 31 code units so that we start - // with the high surrogate on the next round. 
- } else { - buf += 32 * sizeof(char16_t); - } - is_utf16 = validate_utf16le(reinterpret_cast(buf), - (end - buf) / sizeof(char16_t)); - if (!is_utf16) { - return simdutf::encoding_type::unspecified; +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} - } else { - return simdutf::encoding_type::UTF16_LE; - } +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { - } else { - is_utf16 = false; - // Check for UTF-32 - if (length % 4 == 0) { - const char32_t *input32 = reinterpret_cast(buf); - const char32_t *end32 = - reinterpret_cast(start) + length / 4; - if (validate_utf32(input32, end32 - input32)) { - return simdutf::encoding_type::UTF32_LE; - } - } - return simdutf::encoding_type::unspecified; - } - } - // If no surrogate, validate under other encodings as well +using namespace simd; - // UTF-32 validation - currentmax = _mm512_max_epu32(in, currentmax); +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
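+  // For example, the surrogate encoding 0xED 0xA0 (first two bytes of
+  // U+D800, invalid in UTF-8) is caught by these tables: byte_1_high
+  // (above) yields TOO_SHORT|OVERLONG_3|SURROGATE for the high nibble
+  // 0xE, byte_1_low (below) yields CARRY|TOO_LARGE|TOO_LARGE_1000|
+  // SURROGATE for the low nibble 0xD, and byte_2_high yields
+  // TOO_LONG|OVERLONG_2|TWO_CONTS|SURROGATE|TOO_LARGE for the second
+  // byte 0xA0, so their intersection leaves the SURROGATE bit set.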
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - // UTF-8 validation - checker.check_next_input(in); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - buf += 64; - } + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} - // Check which encodings are possible +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = {255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0b11110000u - 1, + 0b11100000u - 1, + 0b11000000u - 1}; + const simd8 max_value( + &max_array[sizeof(max_array) - sizeof(simd8)]); + return input.gt_bits(max_value); +} + +struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + simd8 prev_incomplete; - if (is_utf8) { - size_t current_length = static_cast(buf - start); - if (current_length != length) { - const __m512i utf8 = _mm512_maskz_loadu_epi8( - (1ULL << (length - current_length)) - 1, (const __m512i *)buf); - checker.check_next_input(utf8); - } - checker.check_eof(); - if (!checker.errors()) { - out |= simdutf::encoding_type::UTF8; + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64 &input) { + if (simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } + this->prev_incomplete = + is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; } + } - if (is_utf16 && scalar::utf16::validate( - reinterpret_cast(buf), - (length - (buf - start)) / 2)) { - out |= simdutf::encoding_type::UTF16_LE; - } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - if (is_utf32 && (length % 4 == 0)) { - currentmax = _mm512_max_epu32( - _mm512_maskz_loadu_epi8( - (1ULL << (length - static_cast(buf - start))) - 1, - (const __m512i *)buf), - currentmax); - __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff), - _MM_CMPINT_GT); - if (outside_range == 0) { - out |= simdutf::encoding_type::UTF32_LE; - } - } +}; // struct utf8_checker +} // namespace utf8_validation - return out; - } else if (implementation::validate_utf8(input, length)) { - return simdutf::encoding_type::UTF8; - } else { - return simdutf::encoding_type::unspecified; +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. 
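+ *
+ * The input is processed in 64-byte blocks. The trailing partial block,
+ * if any, is first copied into a 64-byte buffer padded with ASCII bytes
+ * (so the padding cannot introduce an error of its own), and check_eof()
+ * flags a multibyte sequence left unfinished at the end of the input.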
+ */ +template +bool generic_validate_utf8(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); } -simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { - avx512_utf8_checker checker{}; - const char* ptr = buf; - const char* end = ptr + len; - for (; ptr + 64 <= end; ptr += 64) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); - checker.check_next_input(utf8); - } - { - const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr); - checker.check_next_input(utf8); - } - checker.check_eof(); - return ! checker.errors(); -} - -simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { - avx512_utf8_checker checker{}; - const char* ptr = buf; - const char* end = ptr + len; - size_t count{0}; - for (; ptr + 64 <= end; ptr += 64) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); - checker.check_next_input(utf8); - if(checker.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(buf), reinterpret_cast(buf + count), len - count); - res.count += count; - return res; - } - count += 64; - } - { - const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr); - checker.check_next_input(utf8); - if(checker.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(buf), reinterpret_cast(buf + count), len - count); - res.count += count; - return res; - } else { - return result(error_code::SUCCESS, len); - } +bool generic_validate_utf8(const char *input, size_t length) { + return generic_validate_utf8( + reinterpret_cast(input), length); +} + +/** + * Validates that the string is actual UTF-8 and stops on errors. 
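+ *
+ * On failure, the position of the first error is recovered by stepping
+ * back one byte (an error may only be detected in the chunk after the
+ * one that contains it) and re-running the scalar validator, which can
+ * rewind to the start of the offending code point.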
+ */ +template +result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input + count), length - count); + res.count += count; + return res; } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } } -simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return icelake::validate_ascii(buf, len); +result generic_validate_utf8_with_errors(const char *input, size_t length) { + return generic_validate_utf8_with_errors( + reinterpret_cast(input), length); } -simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { - const char* buf_orig = buf; - const char* end = buf + len; - const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); - for (; buf + 64 <= end; buf += 64) { - const __m512i input = _mm512_loadu_si512((const __m512i*)buf); - __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); - if(notascii) { - return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii)); +} // namespace utf8_validation +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_ASCII +/* begin file src/generic/ascii_validation.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace ascii_validation { + +result generic_validate_ascii_with_errors(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); } + reader.advance(); + + count += 64; } - { - const __m512i input = _mm512_maskz_loadu_epi8((1ULL<<(end - buf))-1, (const __m512i*)buf); - __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); - if(notascii) { - return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii)); + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} + +bool generic_validate_ascii(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + while (reader.has_full_block()) { + simd::simd8x64 
in(reader.full_block()); + if (!in.is_ascii()) { + return false; } + reader.advance(); } - return result(error_code::SUCCESS, len); + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + return in.is_ascii(); } -simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { - const char16_t *end = buf + len; +} // namespace ascii_validation +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/ascii_validation.h */ +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + // transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; - for(;buf + 32 <= end; ) { - __m512i in = _mm512_loadu_si512((__m512i*)buf); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if(surrogates) { - __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - return false; - } - bool ends_with_high = ((highsurrogates & 0x80000000) != 0); - if(ends_with_high) { - buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round. - } else { - buf += 32; - } +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
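+  // For instance, the overlong two-byte sequence 0xC0 0x80 (U+0000
+  // wrongly encoded on two bytes) is rejected here: byte_1_high gives
+  // TOO_SHORT|OVERLONG_2 (high nibble 0xC), byte_1_low (below) gives
+  // CARRY|OVERLONG_3|OVERLONG_2|OVERLONG_4 (low nibble 0x0), and
+  // byte_2_high gives TOO_LONG|OVERLONG_2|TWO_CONTS|OVERLONG_3|
+  // TOO_LARGE_1000|OVERLONG_4 (second byte 0x80), leaving OVERLONG_2
+  // set in the three-way AND.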
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + template + simdutf_really_inline size_t convert(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
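+    // For instance, on pure-ASCII input every byte is a leading byte, so
+    // the loop above stops eight bytes from the end and safety_margin is
+    // 9: the vectorized loop below then never comes within 64 + 9 bytes
+    // of the end of the buffer, leaving the tail to the scalar fallback.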
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; } else { - buf += 32; - } - } - if(buf < end) { - __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if(surrogates) { - __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - return false; + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - } - } - return true; -} - -simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { - const char16_t *end = buf + len; - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - for(;buf + 32 <= end; ) { - __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if(surrogates) { - __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - return false; + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // error } - bool ends_with_high = ((highsurrogates & 0x80000000) != 0); - if(ends_with_high) { - buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round. - } else { - buf += 32; + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. 
+ // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; } - } else { - buf += 32; + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. } } - if(buf < end) { - __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if(surrogates) { - __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - return false; - } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); + if (howmany == 0) { + return 0; } + utf16_output += howmany; } - return true; -} + return utf16_output - start; + } -simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { - const char16_t *start_buf = buf; - const char16_t *end = buf + len; - for(;buf + 32 <= end; ) { - __m512i in = _mm512_loadu_si512((__m512i*)buf); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if(surrogates) { - __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1)); - uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1)); - return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high)); + template + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - bool ends_with_high = ((highsurrogates & 0x80000000) != 0); - if(ends_with_high) { - buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round. - } else { - buf += 32; + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; } - } else { - buf += 32; + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
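+        // (Worst case we redo 12 of 64 bytes, i.e. 52/64, roughly 81%.)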
} } - if(buf < end) { - __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if(surrogates) { - __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1)); - uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1)); - return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high)); - } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; } } - return result(error_code::SUCCESS, len); -} + return result(error_code::SUCCESS, utf16_output - start); + } -simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { - const char16_t *start_buf = buf; - const char16_t *end = buf + len; - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - for(;buf + 32 <= end; ) { - __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if(surrogates) { - __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1)); - uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1)); - return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high)); - } - bool ends_with_high = ((highsurrogates & 0x80000000) != 0); - if(ends_with_high) { - buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round. 
- } else { - buf += 32; - } - } else { - buf += 32; - } - } - if(buf < end) { - __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if(surrogates) { - __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1)); - uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1)); - return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high)); - } + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + +template +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. + size_t pos = 0; + char16_t *start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the + // mask far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow + // path. Anything that is not a continuation mask is a 'leading byte', + // that is, the start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* + // of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. 
+ // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16( + input + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. } - return result(error_code::SUCCESS, len); -} - -simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - const char32_t * tail = icelake::validate_utf32(buf, len); - if (tail) { - return scalar::utf32::validate(tail, len - (tail - buf)); - } else { - return false; } + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); + return utf16_output - start; } -simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + // transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf32 { +using namespace simd; - const char32_t* end = len >= 16 ? buf + len - 16 : nullptr; - const char32_t* buf_orig = buf; - while (buf <= end) { - __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf); - __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff), - _MM_CMPINT_GT); - if (outside_range) { - return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range)); - } +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT 
| OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff), - _MM_CMPINT_GT); - if (surrogate_range) { - return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range)); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. 
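+    // (Despite the "16" above, the loop below counts eight leading
+    // bytes, as in the UTF-16 path.)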
+ size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
} - buf += 16; } - if(buf < buf_orig + len) { - __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1<<(buf_orig + len - buf))-1),(const __m512i*)buf); - __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff), - _MM_CMPINT_GT); - if (outside_range) { - return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range)); + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if (howmany == 0) { + return 0; } - __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); + utf32_output += howmany; + } + return utf32_output - start; + } - __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff), - _MM_CMPINT_GT); - if (surrogate_range) { - return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range)); + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. 
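+        // (pos is at the start of the 64-byte block here and
+        // max_starting_point lies 52 bytes further; at no more than 12
+        // bytes consumed per call, that is at least ceil(52/12) = 5
+        // iterations on the slow path.)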
+ while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if (pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; } } + return result(error_code::SUCCESS, utf32_output - start); + } - return result(error_code::SUCCESS, len); -} + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } -simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { - return icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output); -} +}; // struct utf8_checker +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf32 { -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return icelake_convert_latin1_to_utf16(buf, len, utf16_output); -} +using namespace simd; -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return icelake_convert_latin1_to_utf16(buf, len, utf16_output); +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char32_t *utf32_output) noexcept { + size_t pos = 0; + char32_t *start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! 
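+  // The input is known to be valid UTF-8 here, so a fixed margin works:
+  // skipping the last 16 bytes keeps the masked conversion, which may
+  // overrun the bytes it reports as consumed, safely inside the buffer;
+  // the scalar convert_valid below finishes the tail.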
+ while (pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + size_t max_starting_point = (pos + 64) - 12; + while (pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32( + input + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, + utf32_output); + return utf32_output - start; } -simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = avx512_convert_latin1_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } - size_t converted_chars = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_converted_chars == 0) { return 0; } - converted_chars += scalar_converted_chars; +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +// other functions +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input.swap_bytes(); } - return converted_chars; + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + + scalar::utf16::count_code_points(in + pos, size - pos); } -simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { - return icelake::utf8_to_latin1_avx512(buf, len, latin1_output); -} +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
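+  // Each 16-bit unit contributes 1 UTF-8 byte when <= 0x7F, 2 bytes when
+  // <= 0x7FF, 3 bytes for the rest of the basic multilingual plane, and
+  // 2 bytes per surrogate code unit (4 per pair). The masks below are
+  // byte-granular over 32 units, hence the divisions by two.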
+ for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} -simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept { - // Initialize output length and input length counters - size_t inlen = 0; +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, + size_t size) { + return count_code_points(in, size); +} - // First, try to convert as much as possible using the SIMD implementation. - inlen = icelake::utf8_to_latin1_avx512(buf, len, latin1_output); +simdutf_really_inline void +change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { + size_t pos = 0; - // If we have completely converted the string - if(inlen == len) { - return {simdutf::SUCCESS, len}; + while (pos < size / 32 * 32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; } - // Else if there are remaining bytes, use the scalar function to process them. - // Note: This is assuming scalar::utf8_to_latin1::convert_with_errors is a function that takes - // the input buffer, length, and output buffer, and returns a result object with an error code - // and the number of characters processed. 
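// For illustration only: a scalar model of the byte-weight arithmetic in
// utf8_length_from_utf16 above. Each UTF-16 code unit contributes 1 byte
// (ASCII), 2 bytes (up to U+07FF), 3 bytes (other non-surrogates), or 2
// bytes per unit of a surrogate pair (4 bytes per astral code point),
// matching the 2*fourbyte + 3*threebyte + 2*twobyte + ascii weighting.
// Assumes well-formed input:

#include <cstddef>
#include <cstdint>

static size_t utf8_length_from_utf16_scalar(const uint16_t *in, size_t size) {
  size_t bytes = 0;
  for (size_t i = 0; i < size; i++) {
    const uint16_t w = in[i];
    if (w <= 0x7F)                     bytes += 1; // ASCII
    else if (w <= 0x7FF)               bytes += 2; // two-byte sequence
    else if (w < 0xD800 || w > 0xDFFF) bytes += 3; // three-byte sequence
    else                               bytes += 2; // half of a 4-byte pair
  }
  return bytes;
}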
- result res = scalar::utf8_to_latin1::convert_with_errors(buf + inlen, len - inlen, latin1_output + inlen); - res.count += inlen; // Add the number of characters processed by the SIMD implementation - - return res; + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); } +} // namespace utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf16.h */ +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 +/* begin file src/generic/utf8.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8 { -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { - return icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output); -} +using namespace simd; -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16(buf, len, utf16_output); - if (ret.second == nullptr) { - return 0; +simdutf_really_inline size_t count_code_points(const char *in, size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); } - return ret.second - utf16_output; + return count + scalar::utf8::count_code_points(in + pos, size - pos); } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16(buf, len, utf16_output); - if (ret.second == nullptr) { - return 0; - } - return ret.second - utf16_output; -} +#ifdef SIMDUTF_SIMD_HAS_BYTEMASK +simdutf_really_inline size_t count_code_points_bytemask(const char *in, + size_t size) { + using vector_i8 = simd8; + using vector_u8 = simd8; + using vector_u64 = simd64; -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return fast_avx512_convert_utf8_to_utf16_with_errors(buf, len, utf16_output); -} + constexpr size_t N = vector_i8::SIZE; + constexpr size_t max_iterations = 255 / 4; -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return fast_avx512_convert_utf8_to_utf16_with_errors(buf, len, utf16_output); -} + size_t pos = 0; + size_t count = 0; -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length(buf, len, utf16_output); - size_t saved_bytes = ret.second - utf16_output; - const char* end = buf + len; - if (ret.first == end) { - return saved_bytes; - } + auto counters = vector_u64::zero(); + auto local = vector_u8::zero(); + size_t iterations = 0; + for (; pos + 4 * N <= size; pos += 4 * N) { + const auto input0 = + simd8::load(reinterpret_cast(in + pos + 0 * N)); + const auto input1 = + simd8::load(reinterpret_cast(in + pos + 1 * N)); + const auto input2 = + simd8::load(reinterpret_cast(in + pos + 2 * N)); + const auto input3 = + simd8::load(reinterpret_cast(in + pos + 3 * N)); + const auto mask0 = input0 > int8_t(-65); + const auto mask1 = input1 > int8_t(-65); + const auto mask2 = input2 > int8_t(-65); + const 
auto mask3 = input3 > int8_t(-65); - // Note: AVX512 procedure looks up 4 bytes forward, and - // correctly converts multi-byte chars even if their - // continuation bytes lie outsiede 16-byte window. - // It meas, we have to skip continuation bytes from - // the beginning ret.first, as they were already consumed. - while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { - ret.first += 1; + local -= vector_u8(mask0); + local -= vector_u8(mask1); + local -= vector_u8(mask2); + local -= vector_u8(mask3); + + iterations += 1; + if (iterations == max_iterations) { + counters += sum_8bytes(local); + local = vector_u8::zero(); + iterations = 0; + } } - if (ret.first != end) { - const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; + if (iterations > 0) { + count += local.sum_bytes(); } - return saved_bytes; + count += counters.sum(); + + return count + scalar::utf8::count_code_points(in + pos, size - pos); } +#endif // SIMDUTF_SIMD_HAS_BYTEMASK -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length(buf, len, utf16_output); - size_t saved_bytes = ret.second - utf16_output; - const char* end = buf + len; - if (ret.first == end) { - return saved_bytes; - } - - // Note: AVX512 procedure looks up 4 bytes forward, and - // correctly converts multi-byte chars even if their - // continuation bytes lie outsiede 16-byte window. - // It meas, we have to skip continuation bytes from - // the beginning ret.first, as they were already consumed. - while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { - ret.first += 1; +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} - if (ret.first != end) { - const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } +} // namespace utf8 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8.h */ +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + // transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_latin1 { +using namespace simd; - return saved_bytes; -} +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // For UTF-8 to Latin 1, we can allow any ASCII character, and any + // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or + // 0b11000010 and nothing else. 
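// For illustration only: a scalar model of the restriction stated above.
// UTF-8 that fits in Latin 1 may contain ASCII bytes and two-byte sequences
// whose lead byte is exactly 0xC2 or 0xC3 (0b11000010 / 0b11000011)
// followed by one continuation byte; everything else must be rejected:

#include <cstddef>
#include <cstdint>

static bool utf8_fits_latin1_scalar(const uint8_t *in, size_t size) {
  for (size_t i = 0; i < size; i++) {
    if (in[i] < 0x80) continue;                       // ASCII
    if (in[i] != 0xC2 && in[i] != 0xC3) return false; // lead must be C2/C3
    if (i + 1 == size || (in[i + 1] & 0xC0) != 0x80) return false;
    i++; // consume the continuation byte; decodes to U+0080..U+00FF
  }
  return true;
}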
+ // + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + FORBIDDEN, + // 1110____ ________ + FORBIDDEN, + // 1111____ ________ + FORBIDDEN); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + FORBIDDEN, + // ____0101 ________ + FORBIDDEN, + // ____011_ ________ + FORBIDDEN, FORBIDDEN, + + // ____1___ ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, + // ____1101 ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept { - uint32_t * utf32_output = reinterpret_cast(utf32_out); - utf8_to_utf32_result ret = icelake::validating_utf8_to_fixed_length(buf, len, utf32_output); - if (ret.second == nullptr) - return 0; + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} - size_t saved_bytes = ret.second - utf32_output; - const char* end = buf + len; - if (ret.first == end) { - return saved_bytes; - } +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; - // Note: the AVX512 procedure looks up 4 bytes forward, and - // correctly converts multi-byte chars even if their - // continuation bytes lie outside 16-byte window. 
- // It means, we have to skip continuation bytes from - // the beginning ret.first, as they were already consumed. - while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) { - ret.first += 1; + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); } - if (ret.first != end) { - const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert( - ret.first, len - (ret.first - buf), utf32_out + saved_bytes); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; + simdutf_really_inline size_t convert(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 16; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. 
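// For the record, the "at least five times" claim above is simple
// arithmetic: the loop below runs while pos < max_starting_point =
// pos0 + 64 - 12 = pos0 + 52, and each call to convert_masked_utf8_to_latin1
// consumes at most 12 bytes, so covering those 52 bytes takes at least
// ceil(52 / 12) = 5 iterations.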
+ while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if (howmany == 0) { + return 0; + } + latin1_output += howmany; + } + return latin1_output - start; } - return saved_bytes; -} - -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept { - uint32_t * utf32_output = reinterpret_cast(utf32); - auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks(buf, len, utf32_output); - if (!std::get<2>(ret)) { - auto new_buf = std::get<0>(ret); - // rewind_and_convert_with_errors will seek a potential error from new_buf onward, - // with the ability to go back up to new_buf - buf bytes, and read len - (new_buf - buf) bytes forward. - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(new_buf - buf, new_buf, len - (new_buf - buf), reinterpret_cast(std::get<1>(ret))); - res.count += (std::get<0>(ret) - buf); - return res; - } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - const char* end = buf + len; - if (std::get<0>(ret) == end) { - return {simdutf::SUCCESS, saved_bytes}; + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. 
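// For illustration only: the backward margin scan above, in isolation. It
// walks back until eight leading bytes (int8_t(b) > -65) have been seen, and
// the main loop then stops 64 + safety_margin bytes short of the end, so the
// masked converter can never read past the buffer even on invalid input:

#include <cstddef>
#include <cstdint>

static size_t safety_margin_for(const char *in, size_t size) {
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) > -65); // count leading bytes
  }
  return size - margin + 1; // same expression as the code above
}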
+ static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + latin1_output += res.count; + } + } + return result(error_code::SUCCESS, latin1_output - start); } - // Note: the AVX512 procedure looks up 4 bytes forward, and - // correctly converts multi-byte chars even if their - // continuation bytes lie outside 16-byte window. 
- // It means, we have to skip continuation bytes from - // the beginning ret.first, as they were already consumed. - while (std::get<0>(ret) != end and ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) { - std::get<0>(ret) += 1; + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); } - if (std::get<0>(ret) != end) { - auto scalar_result = scalar::utf8_to_utf32::convert_with_errors( - std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast(utf32_output) + saved_bytes); - if (scalar_result.error != simdutf::SUCCESS) { - scalar_result.count += (std::get<0>(ret) - buf); +}; // struct utf8_checker +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + +simdutf_really_inline size_t convert_valid(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last + // 16 bytes, and if the data is valid, then it is entirely safe because 16 + // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally + // assume that you have valid UTF-8 input, so we are going to go back from the + // end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; } else { - scalar_result.count += saved_bytes; + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. 
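// Working out the performance note above with its own numbers: at about 6
// cycles of loop-carried latency per iteration and 6..12 bytes consumed per
// iteration, throughput is bounded between 6/6 = 1 cycle/byte and
// 6/12 = 0.5 cycle/byte. Likewise, re-processing at most 12 of every 64
// bytes yields the quoted worst case of 52/64, roughly 81% efficiency.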
+ size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. } - return scalar_result; } - - return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)}; + if (pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, + latin1_output); + latin1_output += howmany; + } + return latin1_output - start; } +} // namespace utf8_to_latin1 +} // namespace +} // namespace arm64 +} // namespace simdutf + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +// +// Implementation-specific overrides +// +namespace simdutf { +namespace arm64 { -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept { - uint32_t * utf32_output = reinterpret_cast(utf32_out); - utf8_to_utf32_result ret = icelake::valid_utf8_to_fixed_length(buf, len, utf32_output); - size_t saved_bytes = ret.second - utf32_output; - const char* end = buf + len; - if (ret.first == end) { - return saved_bytes; +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; } - - // Note: AVX512 procedure looks up 4 bytes forward, and - // correctly converts multi-byte chars even if their - // continuation bytes lie outsiede 16-byte window. - // It meas, we have to skip continuation bytes from - // the beginning ret.first, as they were already consumed. - while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { - ret.first += 1; + // todo: reimplement as a one-pass algorithm. 
+ int out = 0; + if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; } - - if (ret.first != end) { - const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid( - ret.first, len - (ret.first - buf), utf32_out + saved_bytes); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { + out |= encoding_type::UTF16_LE; + } } - - return saved_bytes; + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + out |= encoding_type::UTF32_LE; + } + } + return out; } +#endif // SIMDUTF_FEATURE_DETECT_ENCODING - -simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - return icelake_convert_utf16_to_latin1(buf,len,latin1_output); +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_utf8(buf, len); } +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - return icelake_convert_utf16_to_latin1(buf,len,latin1_output); +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len); } +#endif // SIMDUTF_FEATURE_UTF8 -simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - return icelake_convert_utf16_to_latin1_with_errors(buf,len,latin1_output).first; +#if SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return arm64::ascii_validation::generic_validate_ascii(buf, len); } -simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - return icelake_convert_utf16_to_latin1_with_errors(buf,len,latin1_output).first; +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return arm64::ascii_validation::generic_validate_ascii_with_errors(buf, len); } +#endif // SIMDUTF_FEATURE_ASCII -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - // optimization opportunity: implement custom function - return convert_utf16be_to_latin1(buf, len, latin1_output); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. 
+ return true; + } + const char16_t *tail = + arm_validate_utf16_as_ascii(buf, len); + if (tail) { + return scalar::utf16::validate_as_ascii( + tail, len - (tail - buf)); + } else { + return false; + } } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - // optimization opportunity: implement custom function - return convert_utf16le_to_latin1(buf, len, latin1_output); +simdutf_warn_unused bool +implementation::validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. + return true; + } + const char16_t *tail = arm_validate_utf16_as_ascii(buf, len); + if (tail) { + return scalar::utf16::validate_as_ascii( + tail, len - (tail - buf)); + } else { + return false; + } } - -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - size_t outlen; - size_t inlen = utf16_to_utf8_avx512i(buf, len, (unsigned char*)utf8_output, &outlen); - if(inlen != len) { return 0; } - return outlen; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. + return true; + } + const char16_t *tail = arm_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, + len - (tail - buf)); + } else { + return false; + } } +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - size_t outlen; - size_t inlen = utf16_to_utf8_avx512i(buf, len, (unsigned char*)utf8_output, &outlen); - if(inlen != len) { return 0; } - return outlen; +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. 
+ return true; + } + const char16_t *tail = arm_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } } -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - size_t outlen; - size_t inlen = utf16_to_utf8_avx512i(buf, len, (unsigned char*)utf8_output, &outlen); - if(inlen != len) { - result res = scalar::utf16_to_utf8::convert_with_errors(buf + inlen, len - outlen, utf8_output + outlen); - res.count += inlen; +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = arm_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { return res; } - return {simdutf::SUCCESS, outlen}; } -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - size_t outlen; - size_t inlen = utf16_to_utf8_avx512i(buf, len, (unsigned char*)utf8_output, &outlen); - if(inlen != len) { - result res = scalar::utf16_to_utf8::convert_with_errors(buf + inlen, len - outlen, utf8_output + outlen); - res.count += inlen; +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = arm_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { return res; } - return {simdutf::SUCCESS, outlen}; } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf16le_to_utf8(buf, len, utf8_output); +void implementation::to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return utf16fix_neon_64bits(input, len, output); } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf16be_to_utf8(buf, len, utf8_output); +void implementation::to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return utf16fix_neon_64bits(input, len, output); } +#endif // SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - return icelake_convert_utf32_to_latin1(buf,len,latin1_output); +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. 
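// In the *_with_errors variants above, `count` carries the position of the
// first error in code units on failure and the number of validated units on
// success. Caller-side usage via the public API, for illustration:

#include "simdutf.h"
#include <cstdio>

static void check(const char16_t *data, size_t units) {
  const simdutf::result r = simdutf::validate_utf16le_with_errors(data, units);
  if (r.error != simdutf::error_code::SUCCESS) {
    std::printf("invalid UTF-16LE at code unit %zu\n", r.count);
  }
}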
+ return true; + } + const char32_t *tail = arm_validate_utf32le(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } } +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - return icelake_convert_utf32_to_latin1_with_errors(buf,len,latin1_output).first; +#if SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = arm_validate_utf32le_with_errors(buf, len); + if (res.count != len) { + result scalar_res = + scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + arm_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; + + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + arm_convert_latin1_to_utf16(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + arm_convert_latin1_to_utf16(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + arm_convert_latin1_to_utf32(buf, len, utf32_output); + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) 
const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return arm64::utf8_to_latin1::convert_valid(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, + utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + 
arm_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_latin1_with_errors(buf, len, + latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function. + return convert_utf16be_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function. 
+ return convert_utf16le_to_latin1(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - return icelake_convert_utf32_to_latin1(buf,len,latin1_output); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t 
len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); } +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = avx512_convert_utf32_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return 0; + } + std::pair ret = + arm_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } size_t saved_bytes = ret.second - utf8_output; if (ret.first != buf + len) { const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -23971,46 +21231,69 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(con ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written return ret.first; } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf32_to_utf8(buf, len, utf8_output); -} - -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = avx512_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf16_output; +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t 
*utf32_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = avx512_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf16_output; +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = avx512_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -24018,16 +21301,27 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written return ret.first; } -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not 
the number of code units written even if finished - std::pair ret = avx512_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -24035,3468 +21329,29372 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written return ret.first; } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return convert_utf32_to_utf16le(buf, len, utf16_output); +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return convert_utf32_to_utf16be(buf, len, utf16_output); +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - 
std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); - if (!std::get<2>(ret)) { return 0; } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_saved_bytes == 0) { return 0; } +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid( + ret.first, len - (ret.first - buf), ret.second); saved_bytes += scalar_saved_bytes; } return saved_bytes; } +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); - if (!std::get<2>(ret)) { return 0; } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_saved_bytes == 0) { return 0; } +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // optimization opportunity: implement a custom function. 
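+  // A valid-input fast path could drop the error branches entirely. A scalar
+  // sketch of what such a custom function might do with the parameters above
+  // (hypothetical, not part of the vendored sources):
+  //
+  //   char *out = utf8_output;
+  //   for (size_t i = 0; i < len; i++) {
+  //     uint32_t cp = uint32_t(buf[i]);
+  //     if (cp < 0x80) {                       // 1 byte
+  //       *out++ = char(cp);
+  //     } else if (cp < 0x800) {               // 2 bytes
+  //       *out++ = char(0xC0 | (cp >> 6));
+  //       *out++ = char(0x80 | (cp & 0x3F));
+  //     } else if (cp < 0x10000) {             // 3 bytes
+  //       *out++ = char(0xE0 | (cp >> 12));
+  //       *out++ = char(0x80 | ((cp >> 6) & 0x3F));
+  //       *out++ = char(0x80 | (cp & 0x3F));
+  //     } else {                               // 4 bytes
+  //       *out++ = char(0xF0 | (cp >> 18));
+  //       *out++ = char(0x80 | ((cp >> 12) & 0x3F));
+  //       *out++ = char(0x80 | ((cp >> 6) & 0x3F));
+  //       *out++ = char(0x80 | (cp & 0x3F));
+  //     }
+  //   }
+  //   return size_t(out - utf8_output);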
+ return convert_utf32_to_utf8(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + arm_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } - return saved_bytes; + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + arm_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return 
convert_utf16le_to_utf32(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + // See + // https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/ + // credit to Pete Cawley + const uint8_t *data = reinterpret_cast(input); + uint64_t result = 0; + const int lanes = sizeof(uint8x16_t); + uint8_t rem = length % lanes; + const uint8_t *simd_end = data + (length / lanes) * lanes; + const uint8x16_t threshold = vdupq_n_u8(0x80); + for (; data < simd_end; data += lanes) { + // load 16 bytes + uint8x16_t input_vec = vld1q_u8(data); + // compare to threshold (0x80) + uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold); + // vertical addition + result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit)); + } + return result + (length / lanes) * lanes + + scalar::latin1::utf8_length_from_latin1((const char *)simd_end, rem); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return arm64_utf8_length_from_utf16_bytemask(input, + length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return arm64_utf8_length_from_utf16_bytemask(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char 
*input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} +simdutf_warn_unused result +implementation::utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept { + return arm64_utf8_length_from_utf16_with_replacement<endianness::LITTLE>( + input, length); +} + +simdutf_warn_unused result +implementation::utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept { + return arm64_utf8_length_from_utf16_with_replacement<endianness::BIG>(input, + length); +} + +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f); + const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff); + const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); + const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); + size_t pos = 0; + size_t count = 0; + for (; pos + 4 <= length; pos += 4) { + uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos)); + const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f); + const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff); + const uint32x4_t two_bytes_bytemask = + veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask); + const uint32x4_t three_bytes_bytemask = + veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask); + + const uint16x8_t reduced_ascii_bytes_bytemask = + vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1)); + const uint16x8_t reduced_two_bytes_bytemask = + vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1)); + const uint16x8_t reduced_three_bytes_bytemask = + vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1)); + + const uint16x8_t compressed_bytemask0 = + vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask); + const uint16x8_t compressed_bytemask1 = + vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask); + + size_t ascii_count = count_ones( + vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0)); + size_t two_bytes_count = count_ones( + vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1)); + size_t three_bytes_count = count_ones( + vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0)); + + count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; + } + return count + + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); + const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); + size_t pos = 0; + size_t count = 0; + for (; pos + 4 <= length; pos += 4) { + uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos)); + const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff); + const uint16x8_t reduced_bytemask = + vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1)); + const uint16x8_t compressed_bytemask = + vpaddq_u16(reduced_bytemask, reduced_bytemask); + size_t surrogate_count = count_ones( + vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0)); + count += 4 + surrogate_count; + } + return count + + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 
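Both NEON counting loops above come down to simple per-code-point arithmetic: a code point costs 1, 2, 3 or 4 UTF-8 bytes according to which of the 0x7f / 0x7ff / 0xffff thresholds it exceeds, and it costs one extra UTF-16 code unit exactly when it lies above 0xffff (a surrogate pair). Scalar mirrors of the two counts, given here only to make the vector arithmetic concrete (hypothetical helpers, not part of the vendored sources):

#include <cstddef>
#include <cstdint>

// 1 byte up to U+007F, 2 up to U+07FF, 3 up to U+FFFF, otherwise 4.
size_t utf8_length_from_utf32_sketch(const char32_t *in, size_t len) {
  size_t count = 0;
  for (size_t i = 0; i < len; i++) {
    uint32_t cp = uint32_t(in[i]);
    count += 1 + (cp > 0x7f) + (cp > 0x7ff) + (cp > 0xffff);
  }
  return count;
}

// One UTF-16 code unit per BMP code point, two per supplementary code point.
size_t utf16_length_from_utf32_sketch(const char32_t *in, size_t len) {
  size_t count = len; // at least one unit per code point
  for (size_t i = 0; i < len; i++) {
    count += (uint32_t(in[i]) > 0xffff); // second unit of a surrogate pair
  }
  return count;
}

The vector loops compute the same totals four code points at a time: 16 minus the savings for ASCII (3), two-byte (2) and three-byte (1) inputs in the first case, and 4 plus the surrogate count in the second.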
+ +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_BASE64 +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + 
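These base64 entry points all share one shape: the runtime base64_options value selects a compile-time specialization of compress_decode_base64 (whose template argument lists did not survive this rendering of the diff), so the alphabet and garbage-handling checks are resolved before the hot loop runs. A toy reduction of that dispatch pattern, with hypothetical names:

#include <cstddef>

enum class alphabet { standard, url };

// Specialized at compile time: the alphabet test folds away inside the loop.
template <alphabet A>
size_t decode_sketch(const char *in, size_t len) {
  size_t n = 0;
  for (size_t i = 0; i < len; i++) {
    const char c = in[i];
    const bool special = (A == alphabet::url) ? (c == '-' || c == '_')
                                              : (c == '+' || c == '/');
    n += special; // stand-in for the real decoding work
  }
  return n;
}

// Runtime option -> compile-time specialization, as in the overloads above.
size_t decode(const char *in, size_t len, bool url_variant) {
  return url_variant ? decode_sketch<alphabet::url>(in, len)
                     : decode_sketch<alphabet::standard>(in, len);
}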
+simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + return encode_base64(output, input, length, options); +} + +size_t implementation::binary_to_base64_with_lines( + const char *input, size_t length, char *output, size_t line_length, + base64_options options) const noexcept { + return encode_base64_impl(output, input, length, options, line_length); +} + +const char *implementation::find(const char *start, const char *end, + char character) const noexcept { + return util_find(start, end, character); +} + +const char16_t *implementation::find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept { + return util_find(start, end, character); +} +#endif // SIMDUTF_FEATURE_BASE64 + +} // namespace arm64 +} // namespace simdutf + +/* begin file src/simdutf/arm64/end.h */ +#undef SIMDUTF_SIMD_HAS_BYTEMASK +/* end file src/simdutf/arm64/end.h */ +/* end file src/arm64/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK +/* begin file src/fallback/implementation.cpp */ +/* begin file src/simdutf/fallback/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "fallback" +// #define SIMDUTF_IMPLEMENTATION fallback +/* end file src/simdutf/fallback/begin.h */ + +namespace simdutf { +namespace fallback { + +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + int out = 0; + // todo: reimplement as a one-pass algorithm. 
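+  // For now this runs up to three independent validations: UTF-8 always,
+  // UTF-16LE when the length is even, and UTF-32LE when it is a multiple of
+  // four. The buffer may therefore be scanned as many as three times; a
+  // one-pass rewrite would track all candidate encodings in a single sweep.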
+ if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; + } + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { + out |= encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + out |= encoding_type::UTF32_LE; + } + } + return out; +} +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return scalar::utf8::validate(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return scalar::utf8::validate_with_errors(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return scalar::ascii::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return scalar::ascii::validate_with_errors(buf, len); +} +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return scalar::utf16::validate_as_ascii(buf, len); +} + +simdutf_warn_unused bool +implementation::validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return scalar::utf16::validate_as_ascii(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} + +void implementation::to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return scalar::utf16::to_well_formed_utf16(input, len, + output); +} + +void implementation::to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return scalar::utf16::to_well_formed_utf16(input, len, + output); +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + return scalar::utf32::validate(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + return scalar::utf32::validate_with_errors(buf, len); +} +#endif // 
SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + return scalar::latin1_to_utf8::convert(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::latin1_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::latin1_to_utf16::convert(buf, len, + utf16_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::latin1_to_utf32::convert(buf, len, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf8_to_latin1::convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_with_errors( + buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_with_errors( + buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_valid(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_valid(buf, len, + utf16_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t 
implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert(buf, len, + latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert(buf, len, + latin1_output); +} + +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_with_errors( + buf, len, latin1_output); +} + +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_with_errors( + buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_valid( + buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_valid(buf, len, + latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, + utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors( + buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors( + buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, + utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return 
scalar::utf16_to_utf8::convert_valid(buf, len, + utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf32_to_latin1::convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors( + buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors( + buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid( + buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, + utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, + 
utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors( + buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors( + buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid( + buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid(buf, len, + utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + scalar::utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + return scalar::utf8::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return scalar::utf8::count_code_points(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + return scalar::latin1_to_utf8::utf8_length_from_latin1(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, + length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, + length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t 
implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return scalar::utf8::utf16_length_from_utf8(input, length); +} +simdutf_warn_unused result +implementation::utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::LITTLE>(input, length); +} + +simdutf_warn_unused result +implementation::utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::BIG>(input, length); +} + +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return scalar::utf32::utf8_length_from_utf32(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return scalar::utf32::utf16_length_from_utf32(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return scalar::utf8::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_BASE64 + +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return simdutf::scalar::base64::base64_to_binary_details_impl( + input, length, output, options, last_chunk_options); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return simdutf::scalar::base64::base64_to_binary_details_impl( + input, length, output, options, last_chunk_options); +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return simdutf::scalar::base64::base64_to_binary_details_impl( + input, length, output, options, last_chunk_options); +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return simdutf::scalar::base64::base64_to_binary_details_impl( + input, length, output, options, last_chunk_options); +} + +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + return scalar::base64::tail_encode_base64(output, input, length, options); +} + +size_t implementation::binary_to_base64_with_lines( + const char *input, size_t length, char *output, size_t line_length, + base64_options options) const noexcept { + return scalar::base64::tail_encode_base64_impl(output, input, length, + options, line_length); +} + +const char *implementation::find(const char *start, const char *end, + char 
character) const noexcept { + return std::find(start, end, character); +} + +const char16_t *implementation::find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept { + return std::find(start, end, character); +} +#endif // SIMDUTF_FEATURE_BASE64 + +} // namespace fallback +} // namespace simdutf + +/* begin file src/simdutf/fallback/end.h */ +/* end file src/simdutf/fallback/end.h */ +/* end file src/fallback/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_ICELAKE +/* begin file src/icelake/implementation.cpp */ +#include +#include + +/* begin file src/simdutf/icelake/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "icelake" +// #define SIMDUTF_IMPLEMENTATION icelake + +#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE +// nothing needed. +#else +SIMDUTF_TARGET_ICELAKE +#endif + +#if SIMDUTF_GCC11ORMORE // workaround for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +// clang-format off +SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) +// clang-format on +#endif // end of workaround +/* end file src/simdutf/icelake/begin.h */ +namespace simdutf { +namespace icelake { +namespace { +#ifndef SIMDUTF_ICELAKE_H + #error "icelake.h must be included" +#endif +using namespace simd; + +/* begin file src/icelake/icelake_macros.inl.cpp */ + +/* + This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a + UTF-8 string) and loads all possible 4-byte substring into an AVX512 + register. + + For example if we have bytes abcdefgh... we create following 32-bit lanes + + [abcd|bcde|cdef|defg|efgh|...] + ^ ^ + byte 0 of reg byte 63 of reg +*/ +/** pshufb + # lane{0,1,2} have got bytes: [ 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, + 11, 12, 13, 14, 15] # lane3 has got bytes: [ 16, 17, 18, 19, 4, 5, + 6, 8, 9, 10, 11, 12, 13, 14, 15] + + expand_ver2 = [ + # lane 0: + 0, 1, 2, 3, + 1, 2, 3, 4, + 2, 3, 4, 5, + 3, 4, 5, 6, + + # lane 1: + 4, 5, 6, 7, + 5, 6, 7, 8, + 6, 7, 8, 9, + 7, 8, 9, 10, + + # lane 2: + 8, 9, 10, 11, + 9, 10, 11, 12, + 10, 11, 12, 13, + 11, 12, 13, 14, + + # lane 3 order: 13, 14, 15, 16 14, 15, 16, 17, 15, 16, 17, 18, 16, + 17, 18, 19 12, 13, 14, 15, 13, 14, 15, 0, 14, 15, 0, 1, 15, 0, 1, 2, + ] +*/ + +#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED) \ + { \ + const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1); \ + const __m512i expand_ver2 = _mm512_setr_epi64( \ + 0x0403020103020100, 0x0605040305040302, 0x0807060507060504, \ + 0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a, \ + 0x000f0e0d0f0e0d0c, 0x0201000f01000f0e); \ + const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); \ + \ + __mmask16 leading_bytes; \ + const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); \ + const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); \ + const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); \ + leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); \ + \ + __m512i char_class; \ + char_class = _mm512_srli_epi32(input, 4); \ + /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ \ + const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); \ + const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); \ + char_class = \ + _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \ + \ + const int valid_count = static_cast(count_ones(leading_bytes)); \ + const __m512i utf32 = expanded_utf8_to_utf32(char_class, input); \ + \ + const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), \ + leading_bytes, utf32); \ + \ + if (UTF32) { \ + if (MASKED) { \ + const __mmask16 valid = 
uint16_t((1 << valid_count) - 1); \ + _mm512_mask_storeu_epi32((__m512i *)output, valid, out); \ + } else { \ + _mm512_storeu_si512((__m512i *)output, out); \ + } \ + output += valid_count; \ + } else { \ + if (MASKED) { \ + output += utf32_to_utf16_masked( \ + byteflip, out, valid_count, reinterpret_cast(output)); \ + } else { \ + output += utf32_to_utf16( \ + byteflip, out, valid_count, reinterpret_cast(output)); \ + } \ + } \ + } + +#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED) \ + { \ + if (UTF32) { \ + if (MASKED) { \ + const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1); \ + _mm512_mask_storeu_epi32((__m512i *)output, valid_mask, INPUT); \ + } else { \ + _mm512_storeu_si512((__m512i *)output, INPUT); \ + } \ + output += VALID_COUNT; \ + } else { \ + if (MASKED) { \ + output += utf32_to_utf16_masked( \ + byteflip, INPUT, VALID_COUNT, \ + reinterpret_cast(output)); \ + } else { \ + output += \ + utf32_to_utf16(byteflip, INPUT, VALID_COUNT, \ + reinterpret_cast(output)); \ + } \ + } \ + } + +#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) \ + if (UTF32) { \ + const __m128i t0 = _mm512_castsi512_si128(utf8); \ + const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1); \ + const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2); \ + const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3); \ + _mm512_storeu_si512((__m512i *)(output + 0 * 16), \ + _mm512_cvtepu8_epi32(t0)); \ + _mm512_storeu_si512((__m512i *)(output + 1 * 16), \ + _mm512_cvtepu8_epi32(t1)); \ + _mm512_storeu_si512((__m512i *)(output + 2 * 16), \ + _mm512_cvtepu8_epi32(t2)); \ + _mm512_storeu_si512((__m512i *)(output + 3 * 16), \ + _mm512_cvtepu8_epi32(t3)); \ + } else { \ + const __m256i h0 = _mm512_castsi512_si256(utf8); \ + const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1); \ + if (big_endian) { \ + _mm512_storeu_si512( \ + (__m512i *)(output + 0 * 16), \ + _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \ + _mm512_storeu_si512( \ + (__m512i *)(output + 2 * 16), \ + _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \ + } else { \ + _mm512_storeu_si512((__m512i *)(output + 0 * 16), \ + _mm512_cvtepu8_epi16(h0)); \ + _mm512_storeu_si512((__m512i *)(output + 2 * 16), \ + _mm512_cvtepu8_epi16(h1)); \ + } \ + } +/* end file src/icelake/icelake_macros.inl.cpp */ +/* begin file src/icelake/icelake_common.inl.cpp */ +// file included directly +/** + * Store the last N bytes of previous followed by 512-N bytes from input. 
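+ * For example, prev<1>(input, previous) places in byte i the byte that
+ * precedes input byte i in the stream (byte 63 of previous flows into
+ * byte 0), which is how each UTF-8 byte gets paired with its predecessor.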
+ */ +template __m512i prev(__m512i input, __m512i previous) { + static_assert(N <= 32, "N must be no larger than 32"); + const __m512i movemask = + _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); + const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous); +#if SIMDUTF_GCC8 || SIMDUTF_GCC9 + constexpr int shift = 16 - N; // workaround for GCC8,9 + return _mm512_alignr_epi8(input, rotated, shift); +#else + return _mm512_alignr_epi8(input, rotated, 16 - N); +#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9 +} + +template +__m512i shuffle_epi128(__m512i v) { + static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3"); + static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3"); + static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3"); + static_assert((idx3 >= 0 && idx3 <= 3), "idx3 must be in range 0..3"); + + constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6); + return _mm512_shuffle_i32x4(v, v, shuffle); +} + +template constexpr __m512i broadcast_epi128(__m512i v) { + return shuffle_epi128(v); +} + +simdutf_really_inline __m512i broadcast_128bit_lane(__m128i lane) { + const __m512i tmp = _mm512_castsi128_si512(lane); + + return broadcast_epi128<0>(tmp); +} +/* end file src/icelake/icelake_common.inl.cpp */ +#if SIMDUTF_FEATURE_UTF8 +/* begin file src/icelake/icelake_utf8_common.inl.cpp */ +// Common procedures for both validating and non-validating conversions from +// UTF-8. +enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL }; + +using utf8_to_utf16_result = std::pair; +using utf8_to_utf32_result = std::pair; + +/* + process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8 + to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes) + might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which + indicates how many input bytes are relevant. + + Returns true when the result is correct, otherwise it returns false. + + The provided in and out pointers are advanced according to how many input + bytes have been processed, upon success. +*/ +template +simdutf_really_inline bool +process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) { + // constants + __m512i mask_identity = _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, + 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0); + __m512i mask_80808080 = _mm512_set1_epi32(0x80808080); + __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0); + __m512i mask_dfdfdfdf_tail = _mm512_set_epi64( + 0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, + 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, + 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf); + __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2); + __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff); + __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0); + __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00); + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + // Note that 'tail' is a compile-time constant ! + __mmask64 b = + (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1; + __m512i input = (tail == SIMDUTF_FULL) ? 
_mm512_loadu_si512(in) + : _mm512_maskz_loadu_epi8(b, in); + __mmask64 m1 = (tail == SIMDUTF_FULL) + ? _mm512_cmplt_epu8_mask(input, mask_80808080) + : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080); + if (_ktestc_mask64_u8(m1, + b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII + // alternatively, we could do 'if (m1 == b) { ' + if (tail == SIMDUTF_FULL) { + in += 64; // consumed 64 bytes + // we convert a full 64-byte block, writing 128 bytes. + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if (big_endian) { + input1 = _mm512_shuffle_epi8(input1, byteflip); + } + _mm512_storeu_si512(out, input1); + out += 32; + __m512i input2 = + _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); + if (big_endian) { + input2 = _mm512_shuffle_epi8(input2, byteflip); + } + _mm512_storeu_si512(out, input2); + out += 32; + return true; // we are done + } else { + in += gap; + if (gap <= 32) { + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if (big_endian) { + input1 = _mm512_shuffle_epi8(input1, byteflip); + } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), + input1); + out += gap; + } else { + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if (big_endian) { + input1 = _mm512_shuffle_epi8(input1, byteflip); + } + _mm512_storeu_si512(out, input1); + out += 32; + __m512i input2 = + _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); + if (big_endian) { + input2 = _mm512_shuffle_epi8(input2, byteflip); + } + _mm512_mask_storeu_epi16( + out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2); + out += gap - 32; + } + return true; // we are done + } + } + // classify characters further + __mmask64 m234 = _mm512_cmp_epu8_mask( + mask_c0c0c0c0, input, + _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte + __mmask64 m34 = + _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input, + _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte + + __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask( + m234, input, mask_c2c2c2c2, + _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence) + // Overlong 2-byte sequence + if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) { + // Overlong 2-byte sequence + return false; + } + if (_ktestz_mask64_u8(m34, m34) == 0) { + // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a + // 4-byte sequence! + __mmask64 m4 = _mm512_cmp_epu8_mask( + input, mask_f0f0f0f0, + _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes) + + __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) + ? _knot_mask64(m1) + : _kand_mask64(_knot_mask64(m1), b); + + __mmask64 mp1 = _kshiftli_mask64(m234, 1); + __mmask64 mp2 = _kshiftli_mask64(m34, 2); + // We could do it as follows... + // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit + // masks a and b and return 1 if all zeroes but GCC generates better code + // when we do: + if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and + // return 1 if all zeroes + // Fast path with 1,2,3 bytes + __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes + __mmask64 m1234 = _kor_mask64(m1, m234); + // mismatched continuation bytes: + if (tail == SIMDUTF_FULL) { + __mmask64 xnormcm1234 = _kxnor_mask64( + mc, + m1234); // XNOR of mc and m1234 should be all zero if they differ + // the presence of a 1 bit indicates that they overlap. + // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return + // 1 if all zeroes. 
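+        // Said differently: every byte must be exactly one of a sequence
+        // leading byte (m1234) or an expected continuation byte (mc), so a
+        // well-formed block has mc == ~m1234 and the XNOR is all zero; any
+        // set bit marks a malformed sequence.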
+ if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { + return false; + } + } else { + __mmask64 bxorm1234 = _kxor_mask64(b, m1234); + if (mc != bxorm1234) { + return false; + } + } + // mend: identifying the last bytes of each sequence to be decoded + __mmask64 mend = _kshiftri_mask64(m1234, 1); + if (tail != SIMDUTF_FULL) { + mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1))); + } + + __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); + __m512i last_and_thirdu16 = + _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); + + __m512i nonasciitags = _mm512_maskz_mov_epi8( + mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 + __m512i clearedbytes = _mm512_andnot_si512( + nonasciitags, input); // high two bits cleared where not ASCII + __m512i lastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, last_and_thirdu16, + clearedbytes); // the last byte of each character + + __mmask64 mask_before_non_ascii = _kshiftri_mask64( + mask_not_ascii, 1); // bytes that precede non-ASCII bytes + __m512i indexofsecondlastbytes = _mm512_add_epi16( + mask_ffffffff, last_and_thirdu16); // indices of the second last bytes + __m512i beforeasciibytes = + _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); + __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, indexofsecondlastbytes, + beforeasciibytes); // the second last bytes (of two, three byte seq, + // surrogates) + secondlastbytes = + _mm512_slli_epi16(secondlastbytes, 6); // shifted into position + + __m512i indexofthirdlastbytes = _mm512_add_epi16( + mask_ffffffff, + indexofsecondlastbytes); // indices of the second last bytes + __m512i thirdlastbyte = + _mm512_maskz_mov_epi8(m34, + clearedbytes); // only those that are the third + // last byte of a sequence + __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, indexofthirdlastbytes, + thirdlastbyte); // the third last bytes (of three byte sequences, hi + // surrogate) + thirdlastbytes = + _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position + __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, + thirdlastbytes, 254); + // the elements of Wout excluding the last element if it happens to be a + // high surrogate: + + __mmask64 mprocessed = + (tail == SIMDUTF_FULL) + ? _pdep_u64(0xFFFFFFFF, mend) + : _pdep_u64( + 0xFFFFFFFF, + _kand_mask64( + mend, b)); // we adjust mend at the end of the output. + + // Encodings out of range... + { + // the location of 3-byte sequence start bytes in the input + __mmask64 m3 = m34 & (b ^ m4); + // code units in Wout corresponding to 3-byte sequences. + __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); + __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); + __mmask32 Msmall800 = + _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); + __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); + __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); + __mmask32 M3s = + _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); + if (_kor_mask32(Msmall800, M3s)) { + return false; + } + } + int64_t nout = _mm_popcnt_u64(mprocessed); + in += 64 - _lzcnt_u64(mprocessed); + if (big_endian) { + Wout = _mm512_shuffle_epi8(Wout, byteflip); + } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); + out += nout; + return true; // ok + } + // + // We have a 4-byte sequence, this is the general case. + // Slow! 
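+  // The mask_d7c0d7c0 / mask_dc00dc00 constants below encode the standard
+  // surrogate-pair construction: for a code point cp in U+10000..U+10FFFF,
+  // hi = 0xd7c0 + (cp >> 10) and lo = 0xdc00 + (cp & 0x3ff), where
+  // 0xd7c0 == 0xd800 - (0x10000 >> 10).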
+ __mmask64 mp3 = _kshiftli_mask64(m4, 3); + __mmask64 mc = + _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes + __mmask64 m1234 = _kor_mask64(m1, m234); + + // mend: identifying the last bytes of each sequence to be decoded + __mmask64 mend = + _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3); + if (tail != SIMDUTF_FULL) { + mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1))); + } + __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); + __m512i last_and_thirdu16 = + _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); + + __m512i nonasciitags = _mm512_maskz_mov_epi8( + mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 + __m512i clearedbytes = _mm512_andnot_si512( + nonasciitags, input); // high two bits cleared where not ASCII + __m512i lastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, last_and_thirdu16, + clearedbytes); // the last byte of each character + + __mmask64 mask_before_non_ascii = _kshiftri_mask64( + mask_not_ascii, 1); // bytes that precede non-ASCII bytes + __m512i indexofsecondlastbytes = _mm512_add_epi16( + mask_ffffffff, last_and_thirdu16); // indices of the second last bytes + __m512i beforeasciibytes = + _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); + __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, indexofsecondlastbytes, + beforeasciibytes); // the second last bytes (of two, three byte seq, + // surrogates) + secondlastbytes = + _mm512_slli_epi16(secondlastbytes, 6); // shifted into position + + __m512i indexofthirdlastbytes = _mm512_add_epi16( + mask_ffffffff, + indexofsecondlastbytes); // indices of the second last bytes + __m512i thirdlastbyte = _mm512_maskz_mov_epi8( + m34, + clearedbytes); // only those that are the third last byte of a sequence + __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, indexofthirdlastbytes, + thirdlastbyte); // the third last bytes (of three byte sequences, hi + // surrogate) + thirdlastbytes = + _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position + __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32( + lastbytes, secondlastbytes, thirdlastbytes, 254); + uint64_t Mlo_uint64 = _pext_u64(mp3, mend); + __mmask32 Mlo = __mmask32(Mlo_uint64); + __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1); + __m512i lo_surr_mask = _mm512_maskz_mov_epi16( + Mlo, + mask_dc00dc00); // lo surr: 1101110000000000, other: 0000000000000000 + __m512i shifted4_thirdsecondandlastbytes = + _mm512_srli_epi16(thirdsecondandlastbytes, + 4); // hi surr: 00000WVUTSRQPNML vuts = WVUTS - 1 + __m512i tagged_lo_surrogates = _mm512_or_si512( + thirdsecondandlastbytes, + lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other: unchanged + __m512i Wout = _mm512_mask_add_epi16( + tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes, + mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other: unchanged + // the elements of Wout excluding the last element if it happens to be a + // high surrogate: + __mmask32 Mout = ~(Mhi & 0x80000000); + __mmask64 mprocessed = + (tail == SIMDUTF_FULL) + ? _pdep_u64(Mout, mend) + : _pdep_u64( + Mout, + _kand_mask64(mend, + b)); // we adjust mend at the end of the output. + + // mismatched continuation bytes: + if (tail == SIMDUTF_FULL) { + __mmask64 xnormcm1234 = _kxnor_mask64( + mc, m1234); // XNOR of mc and m1234 should be all zero if they differ + // the presence of a 1 bit indicates that they overlap. 
+ // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masks and return 1 + // if all zeroes. + if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { + return false; + } + } else { + __mmask64 bxorm1234 = _kxor_mask64(b, m1234); + if (mc != bxorm1234) { + return false; + } + } + // Encodings out of range... + { + // the location of 3-byte sequence start bytes in the input + __mmask64 m3 = m34 & (b ^ m4); + // code units in Wout corresponding to 3-byte sequences. + __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); + __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); + __mmask32 Msmall800 = + _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); + __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); + __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); + __mmask32 M3s = + _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); + __m512i mask_04000400 = _mm512_set1_epi32(0x04000400); + __mmask32 M4s = + _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400); + if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { + return false; + } + } + in += 64 - _lzcnt_u64(mprocessed); + int64_t nout = _mm_popcnt_u64(mprocessed); + if (big_endian) { + Wout = _mm512_shuffle_epi8(Wout, byteflip); + } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); + out += nout; + return true; // ok + } + // Fast path 2: all ASCII or 2 byte + __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) + ? _knot_mask64(m234) + : _kand_mask64(_knot_mask64(m234), b); + // on top of -0xc0 we subtract -2 which we get back later from the + // continuation byte tags + __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2); + __mmask64 leading = (tail == SIMDUTF_FULL) + ? _kor_mask64(m1, m234) + : _kand_mask64(_kor_mask64(m1, m234), + b); // first bytes of each sequence + if (tail == SIMDUTF_FULL) { + __mmask64 xnor234leading = + _kxnor_mask64(_kshiftli_mask64(m234, 1), leading); + if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { + return false; + } + } else { + __mmask64 bxorleading = _kxor_mask64(b, leading); + if (_kshiftli_mask64(m234, 1) != bxorleading) { + return false; + } + } + // + if (tail == SIMDUTF_FULL) { + // In the two-byte/ASCII scenario, we are easily latency bound, so we want + // to increment the input buffer as quickly as possible. + // We process 32 bytes unless the byte at index 32 is a continuation byte, + // in which case we include it as well for a total of 33 bytes. + // Note that if x is an ASCII byte, then the following is false: + // int8_t(x) <= int8_t(0xc0) under two's complement. + in += 32; + if (int8_t(*in) <= int8_t(0xc0)) + in++; + // The alternative is to do + // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); + // but it requires loading the input, doing the mask computation, and + // converting back the mask to a general register. It just takes too long, + // leaving the processor likely to be idle. + } else { + in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); + } + __m512i lead = _mm512_maskz_compress_epi8( + leading, leading2byte); // will contain zero for ascii, and the data + lead = _mm512_cvtepu8_epi16( + _mm512_castsi512_si256(lead)); // ... zero extended into code units + __m512i follow = _mm512_maskz_compress_epi8( + continuation_or_ascii, input); // the last bytes of each sequence + follow = _mm512_cvtepu8_epi16( + _mm512_castsi512_si256(follow)); // ...
zero extended into code units + lead = _mm512_slli_epi16(lead, 6); // shifted into position + __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow + + if (big_endian) { + final = _mm512_shuffle_epi8(final, byteflip); + } + if (tail == SIMDUTF_FULL) { + // Next part is UTF-16 specific and can be generalized to UTF-32. + int nout = _mm_popcnt_u32(uint32_t(leading)); + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); + out += nout; // UTF-8 to UTF-16 is only expansionary in this case. + } else { + int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading))); + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); + out += nout; // UTF-8 to UTF-16 is only expansionary in this case. + } + + return true; // we are fine. +} + +/* + utf32_to_utf16_masked converts `count` lower UTF-32 code units + from input `utf32` into UTF-16. It differs from utf32_to_utf16 + in that it 'masks' the writes. + + Returns how many 16-bit code units were stored. + + byteflip is used for flipping 16-bit code units, and it should be + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + We pass it to the (always inlined) function to encourage the compiler to + keep the value in a (constant) register. +*/ +template +simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, + __m512i utf32, + unsigned int count, + char16_t *output) { + + const __mmask16 valid = uint16_t((1 << count) - 1); + // 1. check if we have any surrogate pairs + const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); + const __mmask16 sp_mask = + _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff); + + if (sp_mask == 0) { + if (big_endian) { + _mm256_mask_storeu_epi16( + (__m256i *)output, valid, + _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), + _mm512_castsi512_si256(byteflip))); + + } else { + _mm256_mask_storeu_epi16((__m256i *)output, valid, + _mm512_cvtepi32_epi16(utf32)); + } + return count; + } + + { + // build surrogate pair code units in 32-bit lanes + + // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] + const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); + const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); + + // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] + const __m512i t1 = _mm512_slli_epi32(t0, 6); + + // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 + // to t0 + // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) + const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); + const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); + + // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 + // to t0 + // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 + const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); + const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); + const __m512i t3 = + _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); + const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); + __m512i t5 = _mm512_ror_epi32(t4, 16); + // Here we want to trim all of the upper 16-bit code units from the 2-byte + // characters represented as 4-byte values. We can compute it from + // sp_mask or the following... It can be more optimized! 
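+ // Editor's note (worked example, not part of upstream simdutf): after the + // rotate, each 32-bit lane holds [hi-surrogate | lo-surrogate] for a pair, + // or [char | 0x0000] for a BMP character. With count = 2 and inputs + // {0x20AC, 0x1F600}, the 16-bit words read 0x0000 0x20AC 0xD83D 0xDE00; + // keeping the odd words (0xaaaaaaaa) plus the nonzero even words and + // compressing yields 0x20AC 0xD83D 0xDE00, i.e. count + popcount(sp_mask) + // = 3 code units.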
+ const __mmask32 nonzero = _kor_mask32( + 0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); + const __mmask32 nonzero_masked = + _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1)); + if (big_endian) { + t5 = _mm512_shuffle_epi8(t5, byteflip); + } + // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability + // (AMD Zen4 has terrible performance with it, it is effectively broken) + __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5); + _mm512_mask_storeu_epi16( + output, _bzhi_u32(0xFFFFFFFF, count + _mm_popcnt_u32(sp_mask)), + compressed); + //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5); + } + + return count + static_cast(count_ones(sp_mask)); +} + +/* + utf32_to_utf16 converts `count` lower UTF-32 code units + from input `utf32` into UTF-16. It may overflow. + + Returns how many 16-bit code units were stored. + + byteflip is used for flipping 16-bit code units, and it should be + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + We pass it to the (always inlined) function to encourage the compiler to + keep the value in a (constant) register. +*/ +template +simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, + __m512i utf32, unsigned int count, + char16_t *output) { + // check if we have any surrogate pairs + const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); + const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); + + if (sp_mask == 0) { + // technically, it should be _mm256_storeu_epi16 + if (big_endian) { + _mm256_storeu_si256( + (__m256i *)output, + _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), + _mm512_castsi512_si256(byteflip))); + } else { + _mm256_storeu_si256((__m256i *)output, _mm512_cvtepi32_epi16(utf32)); + } + return count; + } + + { + // build surrogate pair code units in 32-bit lanes + + // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] + const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); + const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); + + // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] + const __m512i t1 = _mm512_slli_epi32(t0, 6); + + // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 + // to t0 + // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) + const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); + const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); + + // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 + // to t0 + // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 + const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); + const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); + const __m512i t3 = + _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); + const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); + __m512i t5 = _mm512_ror_epi32(t4, 16); + const __mmask32 nonzero = _kor_mask32( + 0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); + if (big_endian) { + t5 = _mm512_shuffle_epi8(t5, byteflip); + } + // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability + // (zen4) + __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5); + _mm512_mask_storeu_epi16( + output, + (1 << (count + static_cast(count_ones(sp_mask)))) - 1, + compressed); + //_mm512_mask_compressstoreu_epi16(output, nonzero, t5); + } + + return 
count + static_cast(count_ones(sp_mask)); +} + +/* + expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`) + stored at separate 32-bit lanes. + + For each lane we have also a character class (`char_class), given in form + 0x8080800N, where N is 4 highest bits from the leading byte; 0x80 resets + corresponding bytes during pshufb. +*/ +simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, + __m512i utf8) { + /* + Input: + - utf8: bytes stored at separate 32-bit code units + - valid: which code units have valid UTF-8 characters + + Bit layout of single word. We show 4 cases for each possible + UTF-8 character encoding. The `?` denotes bits we must not + assume their value. + + |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char + |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char + |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char + |????.????|????.????|????.????|0aaa.aaaa| ASCII char + byte 3 byte 2 byte 1 byte 0 + */ + + /* 1. Reset control bits of continuation bytes and the MSB + of the leading byte; this makes all bytes unsigned (and + does not alter ASCII char). + + |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char + |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char + |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char + |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char + ^^ ^^ ^^ ^ + */ + __m512i values; + const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f); + values = _mm512_and_si512(utf8, v_3f3f_3f7f); + + /* 2. Swap and join fields A-B and C-D + + |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char + |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char + |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char + |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */ + const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140); + values = _mm512_maddubs_epi16(values, v_0140_0140); + + /* 3. Swap and join fields AB & CD + + |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char + |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char + |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char + |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */ + const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000); + values = _mm512_madd_epi16(values, v_0001_1000); + + /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits + |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11 + |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10 + |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9 + |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */ + { + /** pshufb + + continuation = 0 + ascii = 7 + _2_bytes = 9 + _3_bytes = 10 + _4_bytes = 11 + + shift_left_v3 = 4 * [ + ascii, # 0000 + ascii, # 0001 + ascii, # 0010 + ascii, # 0011 + ascii, # 0100 + ascii, # 0101 + ascii, # 0110 + ascii, # 0111 + continuation, # 1000 + continuation, # 1001 + continuation, # 1010 + continuation, # 1011 + _2_bytes, # 1100 + _2_bytes, # 1101 + _3_bytes, # 1110 + _4_bytes, # 1111 + ] */ + const __m512i shift_left_v3 = _mm512_setr_epi64( + 0x0707070707070707, 0x0b0a090900000000, 0x0707070707070707, + 0x0b0a090900000000, 0x0707070707070707, 0x0b0a090900000000, + 0x0707070707070707, 0x0b0a090900000000); + + const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class); + values = _mm512_sllv_epi32(values, shift); + } + + /* 5. 
Shift right the values by variable amounts to reset lowest bits + |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11 + |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16 + |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21 + |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */ + { + // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11] + const __m512i shift_right = _mm512_setr_epi64( + 0x1919191919191919, 0x0b10151500000000, 0x1919191919191919, + 0x0b10151500000000, 0x1919191919191919, 0x0b10151500000000, + 0x1919191919191919, 0x0b10151500000000); + + const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class); + values = _mm512_srlv_epi32(values, shift); + } + + return values; +} + +simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, + int &count) { + const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1); + const __m512i expand_ver2 = _mm512_setr_epi64( + 0x0403020103020100, 0x0605040305040302, 0x0807060507060504, + 0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a, + 0x000f0e0d0f0e0d0c, 0x0201000f01000f0e); + const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); + const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); + const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); + const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); + const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); + count = static_cast(count_ones(leading_bytes)); + return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, + input); +} + +simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) { + __m512i char_class = _mm512_srli_epi32(input, 4); + /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ + const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); + const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); + char_class = + _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); + return expanded_utf8_to_utf32(char_class, input); +} +/* end file src/icelake/icelake_utf8_common.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/icelake/icelake_utf8_validation.inl.cpp */ +// file included directly + +simdutf_really_inline __m512i check_special_cases(__m512i input, + const __m512i prev1) { + __m512i mask1 = _mm512_setr_epi64(0x0202020202020202, 0x4915012180808080, + 0x0202020202020202, 0x4915012180808080, + 0x0202020202020202, 0x4915012180808080, + 0x0202020202020202, 0x4915012180808080); + const __m512i v_0f = _mm512_set1_epi8(0x0f); + __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f); + + __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1); + __m512i mask2 = _mm512_setr_epi64(0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb, + 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb, + 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb, + 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb); + __m512i index2 = _mm512_and_si512(prev1, v_0f); + + __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2); + __m512i mask3 = + _mm512_setr_epi64(0x101010101010101, 0x1010101babaaee6, 0x101010101010101, + 0x1010101babaaee6, 0x101010101010101, 0x1010101babaaee6, + 0x101010101010101, 0x1010101babaaee6); + __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f); + __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3); + return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128); +} + +simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input, + 
const __m512i prev_input, + const __m512i sc) { + __m512i prev2 = prev<2>(input, prev_input); + __m512i prev3 = prev<3>(input, prev_input); + __m512i is_third_byte = _mm512_subs_epu8( + prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0 + __m512i is_fourth_byte = _mm512_subs_epu8( + prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0 + __m512i is_third_or_fourth_byte = + _mm512_or_si512(is_third_byte, is_fourth_byte); + const __m512i v_7f = _mm512_set1_epi8(char(0x7f)); + is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte); + // We want to compute (is_third_or_fourth_byte AND v80) XOR sc. + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc, + 0b1101010); + //__m512i is_third_or_fourth_byte_mask = + //_mm512_and_si512(is_third_or_fourth_byte, v_80); return + // _mm512_xor_si512(is_third_or_fourth_byte_mask, sc); +} +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline __m512i is_incomplete(const __m512i input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + __m512i max_value = _mm512_setr_epi64(0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xbfdfefffffffffff); + return _mm512_subs_epu8(input, max_value); +} + +struct avx512_utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + __m512i error{}; + + // The last input we received + __m512i prev_input_block{}; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + __m512i prev_incomplete{}; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const __m512i input, + const __m512i prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + __m512i prev1 = prev<1>(input, prev_input); + __m512i sc = check_special_cases(input, prev1); + this->error = _mm512_or_si512( + check_multibyte_lengths(input, prev_input, sc), this->error); + } + + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error = _mm512_or_si512(this->error, this->prev_incomplete); + } + + // returns true if ASCII. + simdutf_really_inline bool check_next_input(const __m512i input) { + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80); + if (ascii == 0) { + this->error = _mm512_or_si512(this->error, this->prev_incomplete); + return true; + } else { + this->check_utf8_bytes(input, this->prev_input_block); + this->prev_incomplete = is_incomplete(input); + this->prev_input_block = input; + return false; + } + } + // do not forget to call check_eof! 
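+ // Editor's note (illustrative sketch, not part of upstream simdutf): a + // typical driver loop for this checker processes 64-byte blocks and then + // a masked tail, e.g. + // avx512_utf8_checker c{}; + // size_t pos = 0; + // for (; pos + 64 <= len; pos += 64) { + // c.check_next_input(_mm512_loadu_si512((const __m512i *)(buf + pos))); + // } + // if (pos < len) { + // c.check_next_input(_mm512_maskz_loadu_epi8( + // ~UINT64_C(0) >> (64 - (len - pos)), (const __m512i *)(buf + pos))); + // } + // c.check_eof(); + // bool ok = !c.errors();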
+ simdutf_really_inline bool errors() const { + return _mm512_test_epi8_mask(this->error, this->error) != 0; + } +}; // struct avx512_utf8_checker +/* end file src/icelake/icelake_utf8_validation.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 && \ + (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_LATIN1) +/* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */ +// file included directly + +// File contains conversion procedure from VALID UTF-8 strings. + +/* + valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32. + + The `OUTPUT` template type decides what to do with UTF-32: store + it directly or convert into UTF-16 (with AVX512). + + Input: + - str - valid UTF-8 string + - len - string length + - dwords - output buffer + + Result: + - pair.first - the first unprocessed input byte + - pair.second - the first unprocessed output word +*/ +template <endianness big_endian, typename OUTPUT> +std::pair<const char *, OUTPUT *> +valid_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) { + constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value; + constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value; + static_assert( + UTF32 or UTF16, + "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); + static_assert(!(UTF32 and big_endian), + "we do not currently support big-endian UTF-32"); + + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + const char *ptr = str; + const char *end = ptr + len; + + OUTPUT *output = dwords; + /** + * In the main loop, we consume 64 bytes per iteration, + * but we access 64 + 4 bytes. + * We check for end - ptr >= 64 + 4 because + * we want to do maskless writes without overruns.
+ */ + while (end - ptr >= 64 + 4) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80); + if (ascii == 0) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + continue; + } + + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) + } + const __m512i lane3 = broadcast_epi128<3>(utf8); + int valid_count2; + __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); + uint32_t tmp1; + ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); + const __m512i lane4 = _mm512_set1_epi32(tmp1); + int valid_count3; + __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); + if (valid_count2 + valid_count3 <= 16) { + vec2 = _mm512_mask_expand_epi32( + vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3); + valid_count2 += valid_count3; + vec2 = expand_utf8_to_utf32(vec2); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + } else { + vec2 = expand_utf8_to_utf32(vec2); + vec3 = expand_utf8_to_utf32(vec3); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true) + } + ptr += 4 * 16; + } + + if (end - ptr >= 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80); + if (ascii == 0) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + } else { + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) + } + + const __m512i lane3 = broadcast_epi128<3>(utf8); + SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + + ptr += 3 * 16; + } + } + return {ptr, output}; +} + +using utf8_to_utf16_result = std::pair; +/* end file src/icelake/icelake_from_valid_utf8.inl.cpp */ +/* begin file src/icelake/icelake_from_utf8.inl.cpp */ +// file included directly + +// File contains conversion procedure from possibly 
invalid UTF-8 strings. + +template +// todo: replace with the utf-8 to utf-16 routine adapted to utf-32. This code +// is legacy. +std::pair +validating_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) { + constexpr bool UTF32 = std::is_same::value; + constexpr bool UTF16 = std::is_same::value; + static_assert( + UTF32 or UTF16, + "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); + static_assert(!(UTF32 and big_endian), + "we do not currently support big-endian UTF-32"); + + const char *ptr = str; + const char *end = ptr + len; + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + OUTPUT *output = dwords; + avx512_utf8_checker checker{}; + /** + * In the main loop, we consume 64 bytes per iteration, + * but we access 64 + 4 bytes. + * We use masked writes to avoid overruns, see + * https://github.com/simdutf/simdutf/issues/471 + */ + while (end - ptr >= 64 + 4) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + if (checker.check_next_input(utf8)) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + continue; + } + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) + } + const __m512i lane3 = broadcast_epi128<3>(utf8); + int valid_count2; + __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); + uint32_t tmp1; + ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); + const __m512i lane4 = _mm512_set1_epi32(tmp1); + int valid_count3; + __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); + if (valid_count2 + valid_count3 <= 16) { + vec2 = _mm512_mask_expand_epi32( + vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3); + valid_count2 += valid_count3; + vec2 = expand_utf8_to_utf32(vec2); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + } else { + vec2 = expand_utf8_to_utf32(vec2); + vec3 = expand_utf8_to_utf32(vec3); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true) + } + ptr += 4 * 16; + } + const char *validatedptr = ptr; // validated up to ptr + + // For the final pass, we validate 64 bytes, but we only transcode + // 3*16 bytes, so we may end up double-validating 16 bytes. 
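+ // Editor's note (explanatory comment, not part of upstream simdutf): the + // loop above requires 64 + 4 readable bytes because each iteration also + // reads 4 lookahead bytes (the memcpy into tmp1). For len = 130, one + // iteration consumes 64 bytes (66 left < 68), the block below validates + // the next 64 bytes but transcodes only 3 * 16 = 48 of them, and the + // final masked load covers the remaining 130 - 128 = 2 bytes.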
+ if (end - ptr >= 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + if (checker.check_next_input(utf8)) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + } else { + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) + } + + const __m512i lane3 = broadcast_epi128<3>(utf8); + SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + + ptr += 3 * 16; + } + validatedptr += 4 * 16; + } + if (end != validatedptr) { + const __m512i utf8 = + _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)), + (const __m512i *)validatedptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + if (checker.errors()) { + return {ptr, nullptr}; // We found an error. + } + return {ptr, output}; +} + +// Like validating_utf8_to_fixed_length but returns as soon as an error is +// identified todo: replace with the utf-8 to utf-16 routine adapted to utf-32. +// This code is legacy. +template +std::tuple +validating_utf8_to_fixed_length_with_constant_checks(const char *str, + size_t len, + OUTPUT *dwords) { + constexpr bool UTF32 = std::is_same::value; + constexpr bool UTF16 = std::is_same::value; + static_assert( + UTF32 or UTF16, + "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); + static_assert(!(UTF32 and big_endian), + "we do not currently support big-endian UTF-32"); + + const char *ptr = str; + const char *end = ptr + len; + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + OUTPUT *output = dwords; + avx512_utf8_checker checker{}; + /** + * In the main loop, we consume 64 bytes per iteration, + * but we access 64 + 4 bytes. + */ + while (end - ptr >= 4 + 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + bool ascii = checker.check_next_input(utf8); + if (checker.errors()) { + return {ptr, output, false}; // We found an error. 
+ } + if (ascii) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + continue; + } + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) + } + const __m512i lane3 = broadcast_epi128<3>(utf8); + int valid_count2; + __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); + uint32_t tmp1; + ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); + const __m512i lane4 = _mm512_set1_epi32(tmp1); + int valid_count3; + __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); + if (valid_count2 + valid_count3 <= 16) { + vec2 = _mm512_mask_expand_epi32( + vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3); + valid_count2 += valid_count3; + vec2 = expand_utf8_to_utf32(vec2); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + } else { + vec2 = expand_utf8_to_utf32(vec2); + vec3 = expand_utf8_to_utf32(vec3); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true) + } + ptr += 4 * 16; + } + const char *validatedptr = ptr; // validated up to ptr + + // For the final pass, we validate 64 bytes, but we only transcode + // 3*16 bytes, so we may end up double-validating 16 bytes. + if (end - ptr >= 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + bool ascii = checker.check_next_input(utf8); + if (checker.errors()) { + return {ptr, output, false}; // We found an error. 
+ } + if (ascii) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + } else { + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) + } + + const __m512i lane3 = broadcast_epi128<3>(utf8); + SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + + ptr += 3 * 16; + } + validatedptr += 4 * 16; + } + if (end != validatedptr) { + const __m512i utf8 = + _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)), + (const __m512i *)validatedptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + if (checker.errors()) { + return {ptr, output, false}; // We found an error. + } + return {ptr, output, true}; +} +/* end file src/icelake/icelake_from_utf8.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || + // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_LATIN1) + +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/icelake/icelake_utf16fix.cpp */ +#include + +/* + * Process one block of 32 characters. If in_place is false, + * copy the block from in to out. If there is a sequencing + * error in the block, overwrite the illsequenced characters + * with the replacement character. This function reads one + * character before the beginning of the buffer as a lookback. + * If that character is illsequenced, it too is overwritten. 
+ */ +template <endianness big_endian, bool in_place> +simdutf_really_inline void utf16fix_block(char16_t *out, const char16_t *in) { + const char16_t replacement = scalar::utf16::replacement<big_endian>(); + __m512i lookback, block, lb_masked, block_masked; + __mmask32 lb_is_high, block_is_low, illseq; + auto swap_if_needed = [](uint16_t x) simdutf_constexpr -> uint16_t { + return scalar::utf16::swap_if_needed<big_endian>(x); + }; + + lookback = _mm512_loadu_si512((const __m512i *)(in - 1)); + block = _mm512_loadu_si512((const __m512i *)in); + lb_masked = + _mm512_and_epi32(lookback, _mm512_set1_epi16(swap_if_needed(0xfc00U))); + block_masked = + _mm512_and_epi32(block, _mm512_set1_epi16(swap_if_needed(0xfc00U))); + + lb_is_high = _mm512_cmpeq_epi16_mask( + lb_masked, _mm512_set1_epi16(swap_if_needed(0xd800U))); + block_is_low = _mm512_cmpeq_epi16_mask( + block_masked, _mm512_set1_epi16(swap_if_needed(0xdc00U))); + illseq = _kxor_mask32(lb_is_high, block_is_low); + if (!_ktestz_mask32_u8(illseq, illseq)) { + __mmask32 lb_illseq, block_illseq; + + /* compute the cause of the illegal sequencing */ + lb_illseq = _kandn_mask32(block_is_low, lb_is_high); + block_illseq = _kor_mask32(_kandn_mask32(lb_is_high, block_is_low), + _kshiftri_mask32(lb_illseq, 1)); + + /* fix illegal sequencing in the lookback */ + lb_illseq = _kand_mask32(lb_illseq, _cvtu32_mask32(1)); + _mm512_mask_storeu_epi16(out - 1, lb_illseq, + _mm512_set1_epi16(replacement)); + + /* fix illegal sequencing in the main block */ + if (in_place) { + _mm512_mask_storeu_epi16(out, block_illseq, + _mm512_set1_epi16(replacement)); + } else { + _mm512_storeu_epi32( + out, _mm512_mask_blend_epi16(block_illseq, block, + _mm512_set1_epi16(replacement))); + } + } else if (!in_place) { + _mm512_storeu_si512((__m512i *)out, block); + } +} + +/* + * Special case for inputs of 0--32 code units. Works for both in-place and + * out-of-place operation. + */ +template <endianness big_endian> +void utf16fix_short(const char16_t *in, size_t n, char16_t *out) { + const char16_t replacement = scalar::utf16::replacement<big_endian>(); + __m512i lookback, block, lb_masked, block_masked; + __mmask32 lb_is_high, block_is_low, illseq; + uint32_t mask = 0xFFFFFFFF >> (32 - n); + auto swap_if_needed = [](uint16_t x) simdutf_constexpr -> uint16_t { + return scalar::utf16::swap_if_needed<big_endian>(x); + }; + lookback = _mm512_maskz_loadu_epi16(_cvtmask32_u32(mask << 1), + (const uint16_t *)(in - 1)); + block = _mm512_maskz_loadu_epi16(_cvtmask32_u32(mask), (const uint16_t *)in); + lb_masked = + _mm512_and_epi32(lookback, _mm512_set1_epi16(swap_if_needed(0xfc00u))); + block_masked = + _mm512_and_epi32(block, _mm512_set1_epi16(swap_if_needed(0xfc00u))); + + lb_is_high = _mm512_cmpeq_epi16_mask( + lb_masked, _mm512_set1_epi16(swap_if_needed(0xd800u))); + block_is_low = _mm512_cmpeq_epi16_mask( + block_masked, _mm512_set1_epi16(swap_if_needed(0xdc00u))); + illseq = _kxor_mask32(lb_is_high, block_is_low); + if (!_ktestz_mask32_u8(illseq, illseq)) { + __mmask32 lb_illseq, block_illseq; + + /* compute the cause of the illegal sequencing */ + lb_illseq = _kandn_mask32(block_is_low, lb_is_high); + block_illseq = _kor_mask32(_kandn_mask32(lb_is_high, block_is_low), + _kshiftri_mask32(lb_illseq, 1)); + + /* fix illegal sequencing in the main block */ + _mm512_mask_storeu_epi16( + (uint16_t *)out, _cvtmask32_u32(mask), + _mm512_mask_blend_epi16(block_illseq, block, + _mm512_set1_epi16(replacement))); + } else { + _mm512_mask_storeu_epi16((uint16_t *)out, _cvtmask32_u32(mask), block); + } + out[n - 1] = scalar::utf16::is_high_surrogate<big_endian>(out[n - 1]) + ?
replacement + : out[n - 1]; +} + +template <endianness big_endian> +void utf16fix_avx512(const char16_t *in, size_t n, char16_t *out) { + const char16_t replacement = scalar::utf16::replacement<big_endian>(); + size_t i; + + if (n == 0) + return; + else if (n < 33) { + utf16fix_short<big_endian>(in, n, out); + return; + } + out[0] = + scalar::utf16::is_low_surrogate<big_endian>(in[0]) ? replacement : in[0]; + + /* duplicate code to have the compiler specialise utf16fix_block() */ + if (in == out) { + for (i = 1; i + 32 < n; i += 32) { + utf16fix_block<big_endian, true>(out + i, in + i); + } + + utf16fix_block<big_endian, true>(out + n - 32, in + n - 32); + } else { + for (i = 1; i + 32 < n; i += 32) { + utf16fix_block<big_endian, false>(out + i, in + i); + } + + utf16fix_block<big_endian, false>(out + n - 32, in + n - 32); + } + + out[n - 1] = scalar::utf16::is_high_surrogate<big_endian>(out[n - 1]) + ? replacement + : out[n - 1]; +} +/* end file src/icelake/icelake_utf16fix.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */ +// file included directly + +// File contains conversion procedure from possibly invalid UTF-8 strings. + +template <bool is_remaining> +simdutf_really_inline size_t process_block_from_utf8_to_latin1( + const char *buf, size_t len, char *latin_output, __m512i minus64, + __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) { + __mmask64 load_mask = + is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; + __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); + __mmask64 nonascii = _mm512_movepi8_mask(input); + if (nonascii == 0) { + if (*next_leading_ptr) { // If we ended with a leading byte, it is an error. + return 0; // Indicates error + } + is_remaining + ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input) + : _mm512_storeu_si512((__m512i *)latin_output, input); + return len; + } + + const __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); + + __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); + __mmask64 invalid_leading_bytes = + _mm512_mask_cmpgt_epu8_mask(leading, highbits, one); + + if (invalid_leading_bytes) { + return 0; // Indicates error + } + + __mmask64 leading_shift = (leading << 1) | *next_leading_ptr; + + if ((nonascii ^ leading) != leading_shift) { + return 0; // Indicates error + } + + const __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); + input = + _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); + + __mmask64 retain = ~leading & load_mask; + __m512i output = _mm512_maskz_compress_epi8(retain, input); + int64_t written_out = count_ones(retain); + if (written_out == 0) { + return 0; // Indicates error + } + *next_bit6_ptr = bit6 >> 63; + *next_leading_ptr = leading >> 63; + + __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out); + + _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output); + + return written_out; +} + +size_t utf8_to_latin1_avx512(const char *&inbuf, size_t len, + char *&inlatin_output) { + const char *buf = inbuf; + char *latin_output = inlatin_output; + char *start = latin_output; + size_t pos = 0; + __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ...
1100 0000 + __m512i one = _mm512_set1_epi8(1); + __mmask64 next_leading = 0; + __mmask64 next_bit6 = 0; + + while (pos + 64 <= len) { + size_t written = process_block_from_utf8_to_latin1( + buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6); + if (written == 0) { + inlatin_output = latin_output; + inbuf = buf + pos - next_leading; + return 0; // Indicates error at pos or after, or just before pos (too + // short error) + } + latin_output += written; + pos += 64; + } + + if (pos < len) { + size_t remaining = len - pos; + size_t written = process_block_from_utf8_to_latin1( + buf + pos, remaining, latin_output, minus64, one, &next_leading, + &next_bit6); + if (written == 0) { + inbuf = buf + pos - next_leading; + inlatin_output = latin_output; + return 0; // Indicates error at pos or after, or just before pos (too + // short error) + } + latin_output += written; + } + if (next_leading) { + inbuf = buf + len - next_leading; + inlatin_output = latin_output; + return 0; // Indicates error at end of buffer + } + inlatin_output = latin_output; + inbuf += len; + return size_t(latin_output - start); +} +/* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */ +/* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */ +// file included directly + +// File contains conversion procedure from valid UTF-8 strings. + +template +simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1( + const char *buf, size_t len, char *latin_output, __m512i minus64, + __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) { + __mmask64 load_mask = + is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; + __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); + __mmask64 nonascii = _mm512_movepi8_mask(input); + + if (nonascii == 0) { + is_remaining + ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input) + : _mm512_storeu_si512((__m512i *)latin_output, input); + return len; + } + + __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); + + __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); + + *next_leading_ptr = leading >> 63; + + __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); + input = + _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); + *next_bit6_ptr = bit6 >> 63; + + __mmask64 retain = ~leading & load_mask; + __m512i output = _mm512_maskz_compress_epi8(retain, input); + int64_t written_out = count_ones(retain); + if (written_out == 0) { + return 0; // Indicates error + } + __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out); + // Optimization opportunity: sometimes, masked writes are not needed. + _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output); + return written_out; +} + +size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len, + char *latin_output) { + char *start = latin_output; + size_t pos = 0; + __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 
1100 0000 + __m512i one = _mm512_set1_epi8(1); + __mmask64 next_leading = 0; + __mmask64 next_bit6 = 0; + + while (pos + 64 <= len) { + size_t written = process_valid_block_from_utf8_to_latin1( + buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6); + latin_output += written; + pos += 64; + } + + if (pos < len) { + size_t remaining = len - pos; + size_t written = process_valid_block_from_utf8_to_latin1( + buf + pos, remaining, latin_output, minus64, one, &next_leading, + &next_bit6); + latin_output += written; + } + + return (size_t)(latin_output - start); +} +/* end file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */ +// file included directly +template +size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + __m512i v_0xFF = _mm512_set1_epi16(0xff); + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, + 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + while (end - buf >= 32) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + return 0; + } + _mm256_storeu_si256( + (__m256i *)latin1_output, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 32; + buf += 32; + } + if (buf < end) { + uint32_t mask(uint32_t(1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi16(mask, buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + return 0; + } + _mm256_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); + } + return len; +} + +template +std::pair +icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + const char16_t *start = buf; + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + __m512i v_0xFF = _mm512_set1_epi16(0xff); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, + 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + while (end - buf >= 32) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + uint16_t word; + while ((word = scalar::utf16::swap_if_needed( + uint16_t(*buf))) <= 0xff) { + *latin1_output++ = uint8_t(word); + buf++; + } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); + } + _mm256_storeu_si256( + (__m256i *)latin1_output, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 32; + buf += 32; + } + if (buf < end) { + 
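+ // Editor's note (explanatory comment, not part of upstream simdutf): + // fewer than 32 code units remain here, so the tail is read and written + // under a per-word mask; e.g. with 5 units left, mask = (1 << 5) - 1 = + // 0b11111 touches exactly 5 words.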
uint32_t mask(uint32_t(1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi16(mask, buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + + uint16_t word; + while ((word = scalar::utf16::swap_if_needed( + uint16_t(*buf))) <= 0xff) { + *latin1_output++ = uint8_t(word); + buf++; + } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); + } + _mm256_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); + } + return std::make_pair(result(error_code::SUCCESS, len), latin1_output); +} +/* end file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ +// file included directly + +/** + * This function converts the input (inbuf, inlen), assumed to be valid + * UTF16 (little endian) into UTF-8 (to outbuf). The number of code units + * written is written to 'outlen' and the function reports the number of input + * word consumed. + */ +template +size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen, + unsigned char *outbuf, size_t *outlen) { + __m512i in; + __mmask32 inmask = _cvtu32_mask32(0x7fffffff); + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + const char16_t *const inbuf_orig = inbuf; + const unsigned char *const outbuf_orig = outbuf; + int adjust = 0; + int carry = 0; + + while (inlen >= 32) { + in = _mm512_loadu_si512(inbuf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + inlen -= 31; + lastiteration: + inbuf += 31; + + failiteration: + const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask( + inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT); + + if (_ktestz_mask32_u8(inmask, is234byte)) { + // fast path for ASCII only + _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in); + outbuf += 31; + carry = 0; + + if (inlen < 32) { + goto tail; + } else { + continue; + } + } + + const __mmask32 is12byte = + _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT); + + if (_ktestc_mask32_u8(is12byte, inmask)) { + // fast path for 1 and 2 byte only + + const __m512i twobytes = _mm512_ternarylogic_epi32( + _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6), + _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C + in = _mm512_mask_add_epi16(in, is234byte, twobytes, + _mm512_set1_epi16(int16_t(0x80c0))); + const __m512i cmpmask = + _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)), + _mm512_set1_epi16(0x0800)); + const __mmask64 smoosh = + _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT); + const __m512i out = _mm512_maskz_compress_epi8(smoosh, in); + _mm512_mask_storeu_epi8(outbuf, + _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), + _cvtmask64_u64(smoosh))), + out); + outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte)); + carry = 0; + + if (inlen < 32) { + goto tail; + } else { + continue; + } + } + __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)); + __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)); + + __m512i taglo = _mm512_set1_epi32(0x8080e000); + __m512i taghi = taglo; + + const __m512i fc00masked = + _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00))); + const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask( + inmask, fc00masked, 
_mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ); + const __mmask32 losurr = _mm512_cmp_epu16_mask( + fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ); + + int carryout = 0; + if (!_kortestz_mask32_u8(hisurr, losurr)) { + // handle surrogates + + __m512i los = _mm512_alignr_epi32(hi, lo, 1); + __m512i his = _mm512_alignr_epi32(lo, hi, 1); + + const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16); + taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr), + _mm512_set1_epi32(0x808080f0)); + taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), + _mm512_set1_epi32(0x808080f0)); + + lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10); + hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10); + los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400)); + his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400)); + lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los); + hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his); + + carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30)); + + const uint32_t h = _cvtmask32_u32(hisurr); + const uint32_t l = _cvtmask32_u32(losurr); + // check for mismatched surrogates + if ((h + h + carry) ^ l) { + const uint32_t lonohi = l & ~(h + h + carry); + const uint32_t hinolo = h & ~(l >> 1); + inlen = _tzcnt_u32(hinolo | lonohi); + inmask = __mmask32(0x7fffffff & ((1U << inlen) - 1)); + in = _mm512_maskz_mov_epi16(inmask, in); + adjust = (int)inlen - 31; + inlen = 0; + goto failiteration; + } + } + + hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi); + carry = carryout; + + __m512i mslo = + _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo); + + __m512i mshi = + _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi); + + const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask)); + const __mmask64 outmhi = _kshiftri_mask64(outmask, 16); + + const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte)); + const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16); + const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16); + + taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), + _mm512_set1_epi32(0x80c00000)); + taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), + _mm512_set1_epi32(0x80c00000)); + __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), + _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), + _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + + magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), + _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), + _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + + mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo, + 0xea); // A&B|C + mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi, + 0xea); + mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24); + + mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24); + + const __mmask64 wantlo = + _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT); + const __mmask64 wanthi = + _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT); + const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo); + const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi); + const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo); + const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi); + + uint64_t advlo = 
_mm_popcnt_u64(wantlo_uint64); + uint64_t advhi = _mm_popcnt_u64(wanthi_uint64); + + _mm512_mask_storeu_epi8( + outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo); + _mm512_mask_storeu_epi8( + outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), + outhi); + outbuf += advlo + advhi; + } + outbuf += -adjust; + +tail: + if (inlen != 0) { + // We must have inlen < 31. + inmask = _cvtu32_mask32((1U << inlen) - 1); + in = _mm512_maskz_loadu_epi16(inmask, inbuf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + adjust = (int)inlen - 31; + inlen = 0; + goto lastiteration; + } + *outlen = (outbuf - outbuf_orig) + adjust; + return ((inbuf - inbuf_orig) + adjust); +} +/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf8_to_utf16.inl.cpp */ +// file included directly + +// File contains conversion procedure from possibly invalid UTF-8 strings. + +/** + * Attempts to convert up to len 1-byte code units from in (in UTF-8 format) to + * out. + * Returns the position of the input and output after the processing is + * completed. Upon error, the output is set to null. + */ + +template +utf8_to_utf16_result +fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) { + const char *const final_in = in + len; + bool result = true; + while (result) { + if (final_in - in >= 64) { + result = process_block_utf8_to_utf16( + in, out, final_in - in); + } else if (in < final_in) { + result = process_block_utf8_to_utf16( + in, out, final_in - in); + } else { + break; + } + } + if (!result) { + out = nullptr; + } + return std::make_pair(in, out); +} + +template +simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, + size_t len, + char16_t *out) { + const char *const init_in = in; + const char16_t *const init_out = out; + const char *const final_in = in + len; + bool result = true; + while (result) { + if (final_in - in >= 64) { + result = process_block_utf8_to_utf16( + in, out, final_in - in); + } else if (in < final_in) { + result = process_block_utf8_to_utf16( + in, out, final_in - in); + } else { + break; + } + } + if (!result) { + size_t pos = size_t(in - init_in); + if (pos < len && (init_in[pos] & 0xc0) == 0x80 && pos >= 64) { + // We must check whether we are the fourth continuation byte + bool c1 = (init_in[pos - 1] & 0xc0) == 0x80; + bool c2 = (init_in[pos - 2] & 0xc0) == 0x80; + bool c3 = (init_in[pos - 3] & 0xc0) == 0x80; + if (c1 && c2 && c3) { + return {simdutf::TOO_LONG, pos}; + } + } + // rewind_and_convert_with_errors will seek a potential error from in + // onward, with the ability to go back up to in - init_in bytes, and read + // final_in - in bytes forward. 
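// Recall that UTF-8 continuation bytes have the form 0b10xxxxxx, hence the
// (init_in[pos] & 0xc0) == 0x80 test above. A valid UTF-8 sequence is at
// most four bytes long (one leading byte plus up to three continuation
// bytes), so stopping on a fourth consecutive continuation byte proves the
// input is TOO_LONG. For example, in f0 9f 98 80 80 (U+1F600 followed by a
// stray continuation byte), the final byte is the fourth continuation byte
// in a row and cannot belong to any valid sequence.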
+ simdutf::result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + in - init_in, in, final_in - in, out); + res.count += (in - init_in); + return res; + } else { + return simdutf::result(error_code::SUCCESS, out - init_out); + } +} +/* end file src/icelake/icelake_convert_utf8_to_utf16.inl.cpp */ +/* begin file src/icelake/icelake_utf8_length_from_utf16.inl.cpp */ +template +simdutf_really_inline size_t icelake_utf8_length_from_utf16(const char16_t *in, + size_t size) { + + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; // 32 on AVX-512 + if (N + 1 > size) { + result scalar_result = + scalar::utf16::utf8_length_from_utf16_with_replacement( + in, size); + return scalar_result.count; + } // special case for short inputs + size_t pos = 0; + + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + + size_t count = 0; + + for (; pos < size / (2 * N) * (2 * N); pos += 2 * N) { + + __m512i input1 = + _mm512_loadu_si512(reinterpret_cast(in + pos)); + __m512i input2 = + _mm512_loadu_si512(reinterpret_cast(in + pos + N)); + + if simdutf_constexpr (!match_system(big_endian)) { + input1 = _mm512_shuffle_epi8(input1, byteflip); + input2 = _mm512_shuffle_epi8(input2, byteflip); + } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 .. 0xdfff - high surrogate + __mmask32 is_surrogate1 = _mm512_cmpeq_epi16_mask( + _mm512_and_si512(input1, _mm512_set1_epi16(uint16_t(0xf800))), + _mm512_set1_epi16(uint16_t(0xd800))); + __mmask32 is_surrogate2 = _mm512_cmpeq_epi16_mask( + _mm512_and_si512(input2, _mm512_set1_epi16(uint16_t(0xf800))), + _mm512_set1_epi16(uint16_t(0xd800))); + // c0 - chars that yield 2- or 3-byte UTF-8 codes + __mmask32 c01 = + _mm512_test_epi16_mask(input1, _mm512_set1_epi16(uint16_t(0xff80))); + __mmask32 c02 = + _mm512_test_epi16_mask(input2, _mm512_set1_epi16(uint16_t(0xff80))); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + __mmask32 c11 = + _mm512_test_epi16_mask(input1, _mm512_set1_epi16(uint16_t(0xf800))); + __mmask32 c12 = + _mm512_test_epi16_mask(input2, _mm512_set1_epi16(uint16_t(0xf800))); + count += count_ones32(c01); + count += count_ones32(c11); + count -= count_ones32(is_surrogate1); + count += count_ones32(c02); + count += count_ones32(c12); + count -= count_ones32(is_surrogate2); + } + if (pos + N <= size) { + __m512i input = + _mm512_loadu_si512(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input = _mm512_shuffle_epi8(input, byteflip); + } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 .. 0xdfff - high surrogate + __mmask32 is_surrogate = _mm512_cmpeq_epi16_mask( + _mm512_and_si512(input, _mm512_set1_epi16(uint16_t(0xf800))), + _mm512_set1_epi16(uint16_t(0xd800))); + + // c0 - chars that yield 2- or 3-byte UTF-8 codes + __mmask32 c0 = + _mm512_test_epi16_mask(input, _mm512_set1_epi16(uint16_t(0xff80))); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + __mmask32 c1 = + _mm512_test_epi16_mask(input, _mm512_set1_epi16(uint16_t(0xf800))); + count += count_ones32(c0); + count += count_ones32(c1); + count -= count_ones32(is_surrogate); + pos += N; + } + // At this point, we have processed 'pos' char16 values and we have less than + // N remaining. 
+ __mmask32 remaining_mask = + 0xFFFFFFFFULL >> + (32 - (size - pos)); // mask for the remaining char16 values + __m512i input = _mm512_maskz_loadu_epi16(remaining_mask, in + pos); + if simdutf_constexpr (!match_system(big_endian)) { + input = _mm512_shuffle_epi8(input, byteflip); + } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 .. 0xdfff - high surrogate + __mmask32 is_surrogate = _mm512_cmpeq_epi16_mask( + _mm512_and_si512(input, _mm512_set1_epi16(uint16_t(0xf800))), + _mm512_set1_epi16(uint16_t(0xd800))); + + // c0 - chars that yield 2- or 3-byte UTF-8 codes + __mmask32 c0 = + _mm512_test_epi16_mask(input, _mm512_set1_epi16(uint16_t(0xff80))); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + __mmask32 c1 = + _mm512_test_epi16_mask(input, _mm512_set1_epi16(uint16_t(0xf800))); + count += count_ones32(c0); + count += count_ones32(c1); + count -= count_ones32(is_surrogate); + pos = size; + + count += pos; + return count; +} + +template +simdutf_really_inline result icelake_utf8_length_from_utf16_with_replacement( + const char16_t *in, size_t size) { + /////// + // We repeat 3 times the same algorithm. + // First, we proceed with an unrolled loop of 2*N char16 values (for speed). + // Second, we process N char16 values. + // Finally, we process the remaining char16 values (less than N). + /////// + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; // 32 on AVX-512 + if (N + 1 > size) { + return scalar::utf16::utf8_length_from_utf16_with_replacement( + in, size); + } // special case for short inputs + size_t pos = 0; + + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + + constexpr uint32_t straddle_mask = + match_system(big_endian) ? 0xfc00fc00 : 0x00fc00fc; + constexpr uint32_t straddle_pair = + match_system(big_endian) ? 0xdc00d800 : 0x00dc00d8; + + size_t count = 0; + bool any_surrogates = false; + // We assume all surrogates are mismatched and count here the matched + // ones. 
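// A worked check of this accounting: the loops below count every surrogate
// code unit as 1 + 1 + 1 = 3 bytes, which is exactly the UTF-8 length of the
// replacement character U+FFFD substituted for a lone surrogate. A matched
// pair encodes to 4 bytes rather than 3 + 3 = 6, hence the final
// 'count -= 2 * matches'. The memcpy-based straddle reads below catch pairs
// whose high surrogate is the last unit of one block and whose low surrogate
// is the first unit of the next.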
+ size_t matches = 0; + + for (; pos < (size - 1) / (2 * N) * (2 * N); pos += 2 * N) { + __m512i current1 = + _mm512_loadu_si512(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + current1 = _mm512_shuffle_epi8(current1, byteflip); + } + __m512i current2 = + _mm512_loadu_si512(reinterpret_cast(in + pos + N)); + if simdutf_constexpr (!match_system(big_endian)) { + current2 = _mm512_shuffle_epi8(current2, byteflip); + } + + __mmask32 is_surrogate1 = _mm512_cmpeq_epi16_mask( + _mm512_and_si512(current1, _mm512_set1_epi16(uint16_t(0xf800))), + _mm512_set1_epi16(uint16_t(0xd800))); + __mmask32 is_surrogate2 = _mm512_cmpeq_epi16_mask( + _mm512_and_si512(current2, _mm512_set1_epi16(uint16_t(0xf800))), + _mm512_set1_epi16(uint16_t(0xd800))); + __mmask32 c01 = + _mm512_test_epi16_mask(current1, _mm512_set1_epi16(uint16_t(0xff80))); + __mmask32 c11 = + _mm512_test_epi16_mask(current1, _mm512_set1_epi16(uint16_t(0xf800))); + __mmask32 c02 = + _mm512_test_epi16_mask(current2, _mm512_set1_epi16(uint16_t(0xff80))); + __mmask32 c12 = + _mm512_test_epi16_mask(current2, _mm512_set1_epi16(uint16_t(0xf800))); + count += count_ones32(c01); + count += count_ones32(c11); + count += count_ones32(c02); + count += count_ones32(c12); + if (_kor_mask32(is_surrogate1, is_surrogate2)) { + any_surrogates = true; + __m512i lb_masked1 = + _mm512_and_si512(current1, _mm512_set1_epi16(uint16_t(0xfc00))); + __mmask32 hi_surrogates1 = _mm512_cmpeq_epi16_mask( + lb_masked1, _mm512_set1_epi16(uint16_t(0xd800))); + __mmask32 lo_surrogates1 = _mm512_cmpeq_epi16_mask( + lb_masked1, _mm512_set1_epi16(uint16_t(0xdc00))); + __m512i lb_masked2 = + _mm512_and_si512(current2, _mm512_set1_epi16(uint16_t(0xfc00))); + __mmask32 hi_surrogates2 = _mm512_cmpeq_epi16_mask( + lb_masked2, _mm512_set1_epi16(uint16_t(0xd800))); + __mmask32 lo_surrogates2 = _mm512_cmpeq_epi16_mask( + lb_masked2, _mm512_set1_epi16(uint16_t(0xdc00))); + matches += count_ones32( + _kand_mask32(_kshiftli_mask32(hi_surrogates1, 1), lo_surrogates1)); + matches += count_ones32( + _kand_mask32(_kshiftli_mask32(hi_surrogates2, 1), lo_surrogates2)); + uint32_t straddle1, straddle2; + memcpy(&straddle1, in + pos + 1 * N - 1, sizeof(uint32_t)); + memcpy(&straddle2, in + pos + 2 * N - 1, sizeof(uint32_t)); + matches += ((straddle1 & straddle_mask) == straddle_pair) + + ((straddle2 & straddle_mask) == straddle_pair); + } + } + if (pos + N + 1 <= size) { + __m512i input = + _mm512_loadu_si512(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input = _mm512_shuffle_epi8(input, byteflip); + } + + __mmask32 is_surrogate = _mm512_cmpeq_epi16_mask( + _mm512_and_si512(input, _mm512_set1_epi16(uint16_t(0xf800))), + _mm512_set1_epi16(uint16_t(0xd800))); + __mmask32 c0 = + _mm512_test_epi16_mask(input, _mm512_set1_epi16(uint16_t(0xff80))); + __mmask32 c1 = + _mm512_test_epi16_mask(input, _mm512_set1_epi16(uint16_t(0xf800))); + count += count_ones32(c0); + count += count_ones32(c1); + if (is_surrogate) { + any_surrogates = true; + __m512i lb_masked = + _mm512_and_si512(input, _mm512_set1_epi16(uint16_t(0xfc00))); + __mmask32 hi_surrogates = _mm512_cmpeq_epi16_mask( + lb_masked, _mm512_set1_epi16(uint16_t(0xd800))); + __mmask32 lo_surrogates = _mm512_cmpeq_epi16_mask( + lb_masked, _mm512_set1_epi16(uint16_t(0xdc00))); + matches += count_ones32( + _kand_mask32(_kshiftli_mask32(hi_surrogates, 1), lo_surrogates)); + uint32_t straddle; + memcpy(&straddle, in + pos + N - 1, sizeof(uint32_t)); + matches += (straddle & 
straddle_mask) == straddle_pair;
+    }
+    pos += N;
+  }
+
+  size_t overshoot = 32 - (size - pos);
+  __mmask32 remaining_mask(uint32_t(0xFFFFFFFFULL << overshoot));
+  __m512i input =
+      _mm512_maskz_loadu_epi16(remaining_mask, in + pos - overshoot);
+  if simdutf_constexpr (!match_system(big_endian)) {
+    input = _mm512_shuffle_epi8(input, byteflip);
+  }
+
+  __mmask32 is_surrogate = _mm512_cmpeq_epi16_mask(
+      _mm512_and_si512(input, _mm512_set1_epi16(uint16_t(0xf800))),
+      _mm512_set1_epi16(uint16_t(0xd800)));
+  __mmask32 c0 =
+      _mm512_test_epi16_mask(input, _mm512_set1_epi16(uint16_t(0xff80)));
+  __mmask32 c1 =
+      _mm512_test_epi16_mask(input, _mm512_set1_epi16(uint16_t(0xf800)));
+
+  count += count_ones32(c0);
+  count += count_ones32(c1);
+  if (is_surrogate) {
+    any_surrogates = true;
+    __m512i lb_masked =
+        _mm512_and_si512(input, _mm512_set1_epi16(uint16_t(0xfc00)));
+    __mmask32 hi_surrogates =
+        _mm512_cmpeq_epi16_mask(lb_masked, _mm512_set1_epi16(uint16_t(0xd800)));
+    __mmask32 lo_surrogates =
+        _mm512_cmpeq_epi16_mask(lb_masked, _mm512_set1_epi16(uint16_t(0xdc00)));
+    matches += count_ones32(
+        _kand_mask32(_kshiftli_mask32(hi_surrogates, 1), lo_surrogates));
+  }
+  pos = size;
+  count += pos;
+
+  count -= 2 * matches;
+  return {any_surrogates ? SURROGATE : SUCCESS, count};
+}
+/* end file src/icelake/icelake_utf8_length_from_utf16.inl.cpp */
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+
+#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
+/* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
+// file included directly
+
+/*
+  Returns a pair: the first unprocessed byte from buf and utf32_output
+  A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::tuple<const char16_t *, char32_t *, bool>
+convert_utf16_to_utf32(const char16_t *buf, size_t len,
+                       char32_t *utf32_output) {
+  const char16_t *end = buf + len;
+  const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
+  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+  const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
+  __mmask32 carry{0};
+  const __m512i byteflip = _mm512_setr_epi64(
+      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+      0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  while (std::distance(buf, end) >= 32) {
+    // Always safe because buf + 32 <= end so that end - buf >= 32 bytes:
+    __m512i in = _mm512_loadu_si512((__m512i *)buf);
+    if (big_endian) {
+      in = _mm512_shuffle_epi8(in, byteflip);
+    }
+
+    // H - bitmask for high surrogates
+    const __mmask32 H =
+        _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
+    // L - bitmask for low surrogates
+    const __mmask32 L =
+        _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
+
+    if ((H | L)) {
+      // surrogate pair(s) in a register
+      const __mmask32 V =
+          (L ^
+           (carry | (H << 1))); // A high surrogate must be followed by low one
+                                // and a low one must be preceded by a high one.
+                                // If valid, V should be equal to 0
+
+      if (V == 0) {
+        // valid case
+        /*
+          Input surrogate pair:
+          |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
+              low surrogate      high surrogate
+        */
+        /* 1. Expand all code units to 32-bit code units
+           in
+           |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+        */
+        const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+        const __m512i second =
+            _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
+
+        /* 2.
Shift by one 16-bit word to align low surrogates with high + surrogates in + |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb| + shifted + |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa| + */ + const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1); + const __m512i shifted_second = + _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1); + + /* 3. Align all high surrogates in first and second by shifting to the + left by 10 bits + |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000| + */ + const __m512i aligned_first = + _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10); + const __m512i aligned_second = + _mm512_mask_slli_epi32(second, (__mmask16)(H >> 16), second, 10); + + /* 4. Remove surrogate prefixes and add offset 0x10000 by adding in, + shifted and constant in + |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000| + shifted + |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa| + constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000| + */ + const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400); + const __m512i added_first = _mm512_mask_add_epi32( + aligned_first, (__mmask16)H, aligned_first, shifted_first); + const __m512i utf32_first = _mm512_mask_add_epi32( + added_first, (__mmask16)H, added_first, constant); + + const __m512i added_second = + _mm512_mask_add_epi32(aligned_second, (__mmask16)(H >> 16), + aligned_second, shifted_second); + const __m512i utf32_second = _mm512_mask_add_epi32( + added_second, (__mmask16)(H >> 16), added_second, constant); + + // 5. Store all valid UTF-32 code units (low surrogate positions and + // 32nd word are invalid) + const __mmask32 valid = ~L & 0x7fffffff; + // We deliberately do a _mm512_maskz_compress_epi32 followed by + // storeu_epi32 to ease performance portability to Zen 4. + const __m512i compressed_first = + _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first); + const size_t howmany1 = count_ones((uint16_t)(valid)); + _mm512_storeu_si512((__m512i *)utf32_output, compressed_first); + utf32_output += howmany1; + const __m512i compressed_second = + _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second); + const size_t howmany2 = count_ones((uint16_t)(valid >> 16)); + // The following could be unsafe in some cases? 
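// A worked check of the constant used in step 4: for a valid pair (hi, lo)
// the code point is
//   0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00)
//   = (hi << 10) + lo - 0x35fdc00,
// and -0x35fdc00 modulo 2^32 is 0xfca02400. For U+1F600, the pair
// (0xd83d, 0xde00) gives (0xd83d << 10) + 0xde00 + 0xfca02400, which wraps
// to 0x0001f600 in 32 bits. Regarding the store: a plain 64-byte
// _mm512_storeu_si512, as in the commented-out line below, could write past
// the end of the output buffer when fewer than 16 lanes are valid, which is
// why the masked store is used instead.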
+ //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second); + _mm512_mask_storeu_epi32((__m512i *)utf32_output, + __mmask16((1 << howmany2) - 1), + compressed_second); + utf32_output += howmany2; + // Only process 31 code units, but keep track if the 31st word is a high + // surrogate as a carry + buf += 31; + carry = (H >> 30) & 0x1; + } else { + // invalid case + return std::make_tuple(buf + carry, utf32_output, false); + } + } else { + // no surrogates + // extend all thirty-two 16-bit code units to thirty-two 32-bit code units + _mm512_storeu_si512((__m512i *)(utf32_output), + _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in))); + _mm512_storeu_si512( + (__m512i *)(utf32_output) + 1, + _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1))); + utf32_output += 32; + buf += 32; + carry = 0; + } + } // while + return std::make_tuple(buf + carry, utf32_output, true); +} +/* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF32 +/* begin file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */ +// file included directly +size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + __m512i v_0xFF = _mm512_set1_epi32(0xff); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, + 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0); + while (end - buf >= 16) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + return 0; + } + _mm_storeu_si128( + (__m128i *)latin1_output, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 16; + buf += 16; + } + if (buf < end) { + uint16_t mask = uint16_t((1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi32(mask, buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + return 0; + } + _mm_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + } + return len; +} + +std::pair +icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + const char32_t *start = buf; + __m512i v_0xFF = _mm512_set1_epi32(0xff); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, + 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0); + while (end - buf >= 16) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + while (uint32_t(*buf) <= 0xff) { + *latin1_output++ = uint8_t(*buf++); + } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); + } + _mm_storeu_si128( + (__m128i *)latin1_output, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 16; + buf += 16; + } + if (buf < end) { + uint16_t mask = uint16_t((1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi32(mask, buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + while (uint32_t(*buf) <= 0xff) { + *latin1_output++ = uint8_t(*buf++); + } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); + } + _mm_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + 
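// Note on the shufmask gather above: entries 0, 4, 8, ..., 60 select byte 0
// of each little-endian 32-bit lane, so _mm512_permutexvar_epi8 collects the
// low byte of all 16 code points into the first 16 bytes of the vector; the
// lane holding U+00E9, for instance, contributes the single byte 0xe9. The
// _mm512_cmpgt_epu32_mask guard makes this lossless, since every code point
// stored this way is at most 0xff.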
} + return std::make_pair(result(error_code::SUCCESS, len), latin1_output); +} +/* end file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ +// file included directly + +// Todo: currently, this is just the haswell code, optimize for icelake kernel. +std::pair +avx512_convert_utf32_to_utf8(const char32_t *buf, size_t len, + char *utf8_output) { + const char32_t *end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + __m256i running_max = _mm256_setzero_si256(); + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); + + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), + _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits + // (haswell/avx2_convert_utf16_to_utf8.cpp) + + if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. 
prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+      // 4. pack the bytes
+
+      const uint8_t *row =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t *row_2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+                                                                       16)][0];
+
+      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+      const __m256i utf8_packed = _mm256_shuffle_epi8(
+          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_extractf128_si256(utf8_packed, 1));
+      utf8_output += row_2[0];
+
+      // 6. adjust pointers
+      buf += 16;
+      continue;
+    }
+    // Must check for overflow in packing
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffffffff) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+      forbidden_bytemask = _mm256_or_si256(
+          forbidden_bytemask,
+          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+
+      const __m256i dup_even = _mm256_setr_epi16(
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+      /* In this branch we handle three cases:
+         1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+            single UTF-8 byte
+         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+            UTF-8 bytes
+         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+            three UTF-8 bytes
+
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally from these two code units we build proper UTF-8 sequence, taking
+        into account the case (i.e., the number of bytes to write).
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + 
_mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will + // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD may require + // large, non-trivial tables? + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { // 2-byte + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + // check for invalid input + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + if (static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32( + _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { + return std::make_pair(nullptr, utf8_output); + } + + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf8_output); + } + + return std::make_pair(buf, utf8_output); +} + +// Todo: currently, this is just the haswell code, optimize for icelake kernel. 
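
The scalar fallback in the branch above is the textbook byte-class UTF-32 to UTF-8 encoder. For reference, here is the same logic as a minimal standalone routine; this is a sketch for exposition only, and the name utf32_to_utf8_scalar_ref is ours rather than anything shipped in the vendored sources:

#include <cstddef>
#include <cstdint>

// Scalar reference UTF-32 -> UTF-8 encoder mirroring the fallback branch
// above (an illustrative sketch, not part of the vendored sources). Returns
// the number of bytes written, or 0 on invalid input (a surrogate code point
// or a value above U+10FFFF); 'out' must have room for 4 bytes per code
// point.
std::size_t utf32_to_utf8_scalar_ref(const char32_t *in, std::size_t len,
                                     char *out) {
  char *const start = out;
  for (std::size_t i = 0; i < len; i++) {
    const std::uint32_t word = std::uint32_t(in[i]);
    if ((word & 0xFFFFFF80) == 0) { // 1 byte (ASCII)
      *out++ = char(word);
    } else if ((word & 0xFFFFF800) == 0) { // 2 bytes
      *out++ = char((word >> 6) | 0b11000000);
      *out++ = char((word & 0b111111) | 0b10000000);
    } else if ((word & 0xFFFF0000) == 0) { // 3 bytes
      if (word >= 0xD800 && word <= 0xDFFF) {
        return 0; // lone surrogate: not encodable
      }
      *out++ = char((word >> 12) | 0b11100000);
      *out++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *out++ = char((word & 0b111111) | 0b10000000);
    } else { // 4 bytes
      if (word > 0x10FFFF) {
        return 0; // beyond the Unicode range
      }
      *out++ = char((word >> 18) | 0b11110000);
      *out++ = char(((word >> 12) & 0b111111) | 0b10000000);
      *out++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *out++ = char((word & 0b111111) | 0b10000000);
    }
  }
  return std::size_t(out - start);
}

The avx512_convert_utf32_to_utf8_with_errors variant that follows uses the same structure, but reports a result(error_code, position) instead of returning a null pointer on bad input.
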
+std::pair +avx512_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_output) { + const char32_t *end = buf + len; + const char32_t *start = buf; + + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + // Check for too large input + const __m256i max_input = + _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); + if (static_cast(_mm256_movemask_epi8( + _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + utf8_output); + } + + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), + _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits + // (haswell/avx2_convert_utf16_to_utf8.cpp) + + if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. 
pack the bytes
+
+      const uint8_t *row =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t *row_2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+                                                                       16)][0];
+
+      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+      const __m256i utf8_packed = _mm256_shuffle_epi8(
+          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_extractf128_si256(utf8_packed, 1));
+      utf8_output += row_2[0];
+
+      // 6. adjust pointers
+      buf += 16;
+      continue;
+    }
+    // Must check for overflow in packing
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffffffff) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+
+      // Check for illegal surrogate code units
+      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+      const __m256i forbidden_bytemask =
+          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+          0x0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start),
+                              utf8_output);
+      }
+
+      const __m256i dup_even = _mm256_setr_epi16(
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+      /* In this branch we handle three cases:
+         1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+            single UTF-8 byte
+         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+            UTF-8 bytes
+         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+            three UTF-8 bytes
+
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally from these two code units we build proper UTF-8 sequence, taking
+        into account the case (i.e., the number of bytes to write).
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + 
_mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will + // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD may require + // large, non-trivial tables? + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { // 2-byte + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ +// file included directly + +template +std::pair +avx512_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *end = buf + len; + __mmask32 forbidden_bytemask = 0; + const __m512i v_00000000 = _mm512_setzero_si512(); + const __m512i v_ffff0000 = _mm512_set1_epi32((int32_t)0xffff0000); + const __m512i v_f800 = _mm512_set1_epi32((uint32_t)0xf800); + const __m512i v_d800 = _mm512_set1_epi32((uint32_t)0xd800); + const __m512i v_10ffff = _mm512_set1_epi32(0x10FFFF); + const __m512i v_10000 = _mm512_set1_epi32(0x10000); + const __m512i v_3ff0000 = _mm512_set1_epi32(0x3FF0000); + const __m512i v_3ff = _mm512_set1_epi32(0x3FF); + const __m512i v_dc00d800 = _mm512_set1_epi32((int32_t)0xDC00D800); + + while (end - buf >= std::ptrdiff_t(16)) { + __m512i in = _mm512_loadu_si512(buf); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __mmask16 saturation_bitmask = + _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_ffff0000), v_00000000); + + if (saturation_bitmask == 0xffff) { + forbidden_bytemask |= + _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_f800), v_d800); + + __m256i utf16_packed = _mm512_cvtepi32_epi16(in); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, + 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + 
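// The swap pattern (1, 0, 3, 2, ...) exchanges the two bytes of every 16-bit
// lane, turning the native little-endian code units into the big-endian byte
// order requested by the caller.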
utf16_packed = _mm256_shuffle_epi8(utf16_packed, swap); + } + _mm256_storeu_si256((__m256i *)utf16_output, utf16_packed); + utf16_output += 16; + buf += 16; + } else { + // saturation_bitmask == 1 words will generate 1 utf16 char, + // and saturation_bitmask == 0 words will generate 2 utf16 chars assuming + // no errors. Thus we need a output_mask which has the structure b_2i = 1, + // b_2i+1 = !saturation_bitmask_i + const __mmask32 output_mask = ~_pdep_u32(saturation_bitmask, 0xAAAAAAAA); + const __mmask16 surrogate_bitmask = __mmask16(~saturation_bitmask); + __mmask32 error = _mm512_mask_cmpeq_epi32_mask( + saturation_bitmask, _mm512_and_si512(in, v_f800), v_d800); + error |= _mm512_mask_cmpgt_epu32_mask(surrogate_bitmask, in, v_10ffff); + if (simdutf_unlikely(error)) { + return std::make_pair(nullptr, utf16_output); + } + __m512i v1, v2, v; + // for the bits saturation_bitmask == 0, we need to unpack the 32-bit word + // into two 16 bit words corresponding to high_surrogate and + // low_surrogate. Once the bits are unpacked and merged, the output will + // be compressed as per output_mask. + in = _mm512_mask_sub_epi32(in, surrogate_bitmask, in, v_10000); + v1 = _mm512_mask_slli_epi32(in, surrogate_bitmask, in, 16); + v1 = _mm512_mask_and_epi32(in, surrogate_bitmask, v1, v_3ff0000); + v2 = _mm512_mask_srli_epi32(in, surrogate_bitmask, in, 10); + v2 = _mm512_mask_and_epi32(in, surrogate_bitmask, v2, v_3ff); + v = _mm512_or_si512(v1, v2); + in = _mm512_mask_add_epi32(in, surrogate_bitmask, v, v_dc00d800); + if (big_endian) { + const __m512i swap_512 = _mm512_set_epi8( + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, + 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, + 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, + 2, 3, 0, 1); + in = _mm512_shuffle_epi8(in, swap_512); + } + // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability + // (AMD Zen4 has terrible performance with it, it is effectively broken) + __m512i compressed = _mm512_maskz_compress_epi16(output_mask, in); + auto written_out = _mm_popcnt_u32(output_mask); + _mm512_mask_storeu_epi16(utf16_output, _bzhi_u32(0xFFFFFFFF, written_out), + compressed); + //_mm512_mask_compressstoreu_epi16(utf16_output, output_mask, in); + utf16_output += written_out; + buf += 16; + } + } + + size_t remaining_len = size_t(end - buf); + if (remaining_len) { + __mmask16 input_mask = __mmask16((1 << remaining_len) - 1); + __m512i in = _mm512_maskz_loadu_epi32(input_mask, buf); + const __mmask16 saturation_bitmask = + _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_ffff0000), v_00000000) & + input_mask; + if (saturation_bitmask == input_mask) { + forbidden_bytemask |= + _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_f800), v_d800); + + __m256i utf16_packed = _mm512_cvtepi32_epi16(in); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, + 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm256_shuffle_epi8(utf16_packed, swap); + } + _mm256_mask_storeu_epi16(utf16_output, input_mask, utf16_packed); + utf16_output += remaining_len; + buf += remaining_len; + } else { + const __mmask32 output_max_mask = (1 << (remaining_len * 2)) - 1; + const __mmask32 output_mask = + (~_pdep_u32(saturation_bitmask, 0xAAAAAAAA)) & output_max_mask; + const __mmask16 surrogate_bitmask = + __mmask16(~saturation_bitmask) & input_mask; + __mmask32 error = _mm512_mask_cmpeq_epi32_mask( + saturation_bitmask, 
_mm512_and_si512(in, v_f800), v_d800); + error |= _mm512_mask_cmpgt_epu32_mask(surrogate_bitmask, in, v_10ffff); + if (simdutf_unlikely(error)) { + return std::make_pair(nullptr, utf16_output); + } + __m512i v1, v2, v; + in = _mm512_mask_sub_epi32(in, surrogate_bitmask, in, v_10000); + v1 = _mm512_mask_slli_epi32(in, surrogate_bitmask, in, 16); + v1 = _mm512_mask_and_epi32(in, surrogate_bitmask, v1, v_3ff0000); + v2 = _mm512_mask_srli_epi32(in, surrogate_bitmask, in, 10); + v2 = _mm512_mask_and_epi32(in, surrogate_bitmask, v2, v_3ff); + v = _mm512_or_si512(v1, v2); + in = _mm512_mask_add_epi32(in, surrogate_bitmask, v, v_dc00d800); + if (big_endian) { + const __m512i swap_512 = _mm512_set_epi8( + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, + 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, + 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, + 2, 3, 0, 1); + in = _mm512_shuffle_epi8(in, swap_512); + } + // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability + // (AMD Zen4 has terrible performance with it, it is effectively broken) + __m512i compressed = _mm512_maskz_compress_epi16(output_mask, in); + auto written_out = _mm_popcnt_u32(output_mask); + _mm512_mask_storeu_epi16(utf16_output, _bzhi_u32(0xFFFFFFFF, written_out), + compressed); + //_mm512_mask_compressstoreu_epi16(utf16_output, output_mask, in); + utf16_output += written_out; + buf += remaining_len; + } + } + + // check for invalid input + if (forbidden_bytemask != 0) { + return std::make_pair(nullptr, utf16_output); + } + + return std::make_pair(buf, utf16_output); +} + +template +std::pair +avx512_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; + const __m512i v_00000000 = _mm512_setzero_si512(); + const __m512i v_ffff0000 = _mm512_set1_epi32((int32_t)0xffff0000); + const __m512i v_f800 = _mm512_set1_epi32((uint32_t)0xf800); + const __m512i v_d800 = _mm512_set1_epi32((uint32_t)0xd800); + const __m512i v_10ffff = _mm512_set1_epi32(0x10FFFF); + const __m512i v_10000 = _mm512_set1_epi32(0x10000); + const __m512i v_3ff0000 = _mm512_set1_epi32(0x3FF0000); + const __m512i v_3ff = _mm512_set1_epi32(0x3FF); + const __m512i v_dc00d800 = _mm512_set1_epi32((int32_t)0xDC00D800); + int error_idx = 0; + error_code code = error_code::SUCCESS; + bool err = false; + + while (end - buf >= std::ptrdiff_t(16)) { + __m512i in = _mm512_loadu_si512(buf); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __mmask16 saturation_bitmask = + _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_ffff0000), v_00000000); + + if (saturation_bitmask == 0xffff) { + __mmask32 forbidden_bytemask = + _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_f800), v_d800); + + __m256i utf16_packed = _mm512_cvtepi32_epi16(in); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, + 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm256_shuffle_epi8(utf16_packed, swap); + } + if (simdutf_unlikely(forbidden_bytemask)) { + int idx = _tzcnt_u32(forbidden_bytemask); + _mm256_mask_storeu_epi16( + utf16_output, __mmask16(_blsmsk_u32(forbidden_bytemask) >> 1), + utf16_packed); + return std::make_pair(result(error_code::SURROGATE, buf - start + idx), + utf16_output + idx); + } + _mm256_storeu_si256((__m256i *)utf16_output, utf16_packed); + utf16_output += 16; + } else { + 
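// Worked example of the surrogate packing performed below: for U+1F600,
// in - 0x10000 = 0xf600; v1 holds the low 10 bits shifted to bits 16..25
// (0x02000000), v2 holds the high 10 bits in bits 0..9 (0x3d), and adding
// v_dc00d800 yields 0xde00d83d: the high surrogate 0xd83d in the low half
// and the low surrogate 0xde00 in the high half, so the code units land in
// memory as 0xd83d followed by 0xde00, the correct UTF-16 order.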
__mmask32 output_mask = ~_pdep_u32(saturation_bitmask, 0xAAAAAAAA); + const __mmask16 surrogate_bitmask = __mmask16(~saturation_bitmask); + __mmask32 error_surrogate = _mm512_mask_cmpeq_epi32_mask( + saturation_bitmask, _mm512_and_si512(in, v_f800), v_d800); + __mmask32 error_too_large = + _mm512_mask_cmpgt_epu32_mask(surrogate_bitmask, in, v_10ffff); + if (simdutf_unlikely(error_surrogate || error_too_large)) { + // Need to find the lowest set bit between the two error masks + // Need to also write the partial chunk until the error index to output. + int large_idx = _tzcnt_u32(error_too_large); + int surrogate_idx = _tzcnt_u32(error_surrogate); + err = true; + if (large_idx < surrogate_idx) { + code = error_code::TOO_LARGE; + error_idx = large_idx; + } else { + code = error_code::SURROGATE; + error_idx = surrogate_idx; + } + output_mask &= ((1 << (2 * error_idx)) - 1); + } + __m512i v1, v2, v; + in = _mm512_mask_sub_epi32(in, surrogate_bitmask, in, v_10000); + v1 = _mm512_mask_slli_epi32(in, surrogate_bitmask, in, 16); + v1 = _mm512_mask_and_epi32(in, surrogate_bitmask, v1, v_3ff0000); + v2 = _mm512_mask_srli_epi32(in, surrogate_bitmask, in, 10); + v2 = _mm512_mask_and_epi32(in, surrogate_bitmask, v2, v_3ff); + v = _mm512_or_si512(v1, v2); + in = _mm512_mask_add_epi32(in, surrogate_bitmask, v, v_dc00d800); + if (big_endian) { + const __m512i swap_512 = _mm512_set_epi8( + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, + 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, + 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, + 2, 3, 0, 1); + in = _mm512_shuffle_epi8(in, swap_512); + } + // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability + // (AMD Zen4 has terrible performance with it, it is effectively broken) + __m512i compressed = _mm512_maskz_compress_epi16(output_mask, in); + auto written_out = _mm_popcnt_u32(output_mask); + _mm512_mask_storeu_epi16(utf16_output, _bzhi_u32(0xFFFFFFFF, written_out), + compressed); + //_mm512_mask_compressstoreu_epi16(utf16_output, output_mask, in); + utf16_output += written_out; + if (simdutf_unlikely(err)) { + return std::make_pair(result(code, buf - start + error_idx), + utf16_output); + } + } + buf += 16; + } + + size_t remaining_len = size_t(end - buf); + if (remaining_len) { + __mmask16 input_mask = __mmask16((1 << remaining_len) - 1); + __m512i in = _mm512_maskz_loadu_epi32(input_mask, buf); + const __mmask16 saturation_bitmask = + _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_ffff0000), v_00000000) & + input_mask; + if (saturation_bitmask == input_mask) { + __mmask32 forbidden_bytemask = + _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_f800), v_d800); + __m256i utf16_packed = _mm512_cvtepi32_epi16(in); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, + 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm256_shuffle_epi8(utf16_packed, swap); + } + if (simdutf_unlikely(forbidden_bytemask)) { + int idx = _tzcnt_u32(forbidden_bytemask); + _mm256_mask_storeu_epi16( + utf16_output, __mmask16(_blsmsk_u32(forbidden_bytemask) >> 1), + utf16_packed); + return std::make_pair(result(error_code::SURROGATE, buf - start + idx), + utf16_output + idx); + } + _mm256_mask_storeu_epi16(utf16_output, input_mask, utf16_packed); + utf16_output += remaining_len; + } else { + const __mmask32 output_max_mask = (1 << (remaining_len * 2)) - 1; + __mmask32 output_mask = + (~_pdep_u32(saturation_bitmask, 
0xAAAAAAAA)) & output_max_mask; + const __mmask16 surrogate_bitmask = + __mmask16(~saturation_bitmask) & input_mask; + __mmask32 error_surrogate = _mm512_mask_cmpeq_epi32_mask( + saturation_bitmask, _mm512_and_si512(in, v_f800), v_d800); + __mmask32 error_too_large = + _mm512_mask_cmpgt_epu32_mask(surrogate_bitmask, in, v_10ffff); + if (simdutf_unlikely(error_surrogate || error_too_large)) { + int large_idx = _tzcnt_u32(error_too_large); + int surrogate_idx = _tzcnt_u32(error_surrogate); + err = true; + if (large_idx < surrogate_idx) { + code = error_code::TOO_LARGE; + error_idx = large_idx; + } else { + code = error_code::SURROGATE; + error_idx = surrogate_idx; + } + output_mask &= ((1 << (2 * error_idx)) - 1); + } + __m512i v1, v2, v; + in = _mm512_mask_sub_epi32(in, surrogate_bitmask, in, v_10000); + v1 = _mm512_mask_slli_epi32(in, surrogate_bitmask, in, 16); + v1 = _mm512_mask_and_epi32(in, surrogate_bitmask, v1, v_3ff0000); + v2 = _mm512_mask_srli_epi32(in, surrogate_bitmask, in, 10); + v2 = _mm512_mask_and_epi32(in, surrogate_bitmask, v2, v_3ff); + v = _mm512_or_si512(v1, v2); + in = _mm512_mask_add_epi32(in, surrogate_bitmask, v, v_dc00d800); + if (big_endian) { + const __m512i swap_512 = _mm512_set_epi8( + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, + 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, + 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, + 2, 3, 0, 1); + in = _mm512_shuffle_epi8(in, swap_512); + } + // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability + // (AMD Zen4 has terrible performance with it, it is effectively broken) + __m512i compressed = _mm512_maskz_compress_epi16(output_mask, in); + auto written_out = _mm_popcnt_u32(output_mask); + _mm512_mask_storeu_epi16(utf16_output, _bzhi_u32(0xFFFFFFFF, written_out), + compressed); + //_mm512_mask_compressstoreu_epi16(utf16_output, output_mask, in); + utf16_output += written_out; + if (simdutf_unlikely(err)) { + return std::make_pair(result(code, buf - start + error_idx), + utf16_output); + } + } + buf += remaining_len; + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); +} +/* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_ASCII +/* begin file src/icelake/icelake_ascii_validation.inl.cpp */ +// file included directly + +bool validate_ascii(const char *buf, size_t len) { + const char *end = buf + len; + const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); + __m512i running_or = _mm512_setzero_si512(); + for (; end - buf >= 64; buf += 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)buf); + running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, + 0xf8); // running_or | (utf8 & ascii) + } + if (buf < end) { + const __m512i utf8 = _mm512_maskz_loadu_epi8( + (uint64_t(1) << (end - buf)) - 1, (const __m512i *)buf); + running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, + 0xf8); // running_or | (utf8 & ascii) + } + return (_mm512_test_epi8_mask(running_or, running_or) == 0); +} +/* end file src/icelake/icelake_ascii_validation.inl.cpp */ +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/icelake/icelake_utf32_validation.inl.cpp */ +// file included directly + +bool validate_utf32(const char32_t *buf, size_t len) { + if (simdutf_unlikely(len == 0)) { + return true; + } + const char32_t *end = buf + len; + + const 
__m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000); + __m512i currentmax = _mm512_setzero_si512(); + __m512i currentoffsetmax = _mm512_setzero_si512(); + + // Optimized: Process 32 values (2x 512-bit) per iteration for better + // throughput + while (end - buf >= 32) { + __m512i utf32_1 = _mm512_loadu_si512((const __m512i *)buf); + __m512i utf32_2 = _mm512_loadu_si512((const __m512i *)(buf + 16)); + buf += 32; + + // Process both blocks in parallel to maximize instruction-level parallelism + __m512i offsetmax_1 = _mm512_add_epi32(utf32_1, offset); + __m512i offsetmax_2 = _mm512_add_epi32(utf32_2, offset); + + currentoffsetmax = _mm512_max_epu32(offsetmax_1, currentoffsetmax); + currentmax = _mm512_max_epu32(utf32_1, currentmax); + + currentoffsetmax = _mm512_max_epu32(offsetmax_2, currentoffsetmax); + currentmax = _mm512_max_epu32(utf32_2, currentmax); + } + + // Handle remaining 16-31 values + if (end - buf >= 16) { + __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf); + buf += 16; + currentoffsetmax = + _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax); + currentmax = _mm512_max_epu32(utf32, currentmax); + } + + // Handle remaining 0-15 values with masked load + if (buf < end) { + __m512i utf32 = + _mm512_maskz_loadu_epi32(__mmask16((1 << (end - buf)) - 1), buf); + currentoffsetmax = + _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax); + currentmax = _mm512_max_epu32(utf32, currentmax); + } + + const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff); + const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff); + const auto outside_range = _mm512_cmpgt_epu32_mask(currentmax, standardmax); + if (outside_range != 0) { + return false; + } + + const auto surrogate = + _mm512_cmpgt_epu32_mask(currentoffsetmax, standardoffsetmax); + if (surrogate != 0) { + return false; + } + + return true; +} +/* end file src/icelake/icelake_utf32_validation.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 +/* begin file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */ +// file included directly + +static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len, + char *utf8_output, + int mask_output) { + __mmask64 nonascii = _mm512_movepi8_mask(input); + size_t output_size = input_len + (size_t)count_ones(nonascii); + + // Mask to denote whether the byte is a leading byte that is not ascii + __mmask64 sixth = _mm512_cmpge_epu8_mask( + input, _mm512_set1_epi8(-64)); // binary representation of -64: 1100 0000 + + const uint64_t alternate_bits = UINT64_C(0x5555555555555555); + uint64_t ascii = ~nonascii; + // the bits in ascii are inverted and zeros are interspersed in between them + uint64_t maskA = ~_pdep_u64(ascii, alternate_bits); + uint64_t maskB = ~_pdep_u64(ascii >> 32, alternate_bits); + + // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD) + __m512i input_interleaved = _mm512_permutexvar_epi8( + _mm512_set_epi32(0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818, + 0x37173616, 0x35153414, 0x33133212, 0x31113010, + 0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808, + 0x27072606, 0x25052404, 0x23032202, 0x21012000), + input); + + // double size of each byte, and insert the leading byte 1100 0010 + + /* + upscale the bytes to 16-bit values, adding the 0b11000000 leading byte in the + process. We adjust for the bytes that have their two most significant bits + set. This takes care of the first 32 bytes, assuming we interleaved the bytes.
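+ (For example, the Latin-1 byte 0xE9, 'é', expands to the two UTF-8 bytes
+ 0xC3 0xA9: the leading byte carries the top two data bits, the
+ continuation byte the low six.)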
*/ + __m512i outputA = + _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8); + outputA = _mm512_mask_add_epi16( + outputA, (__mmask32)sixth, outputA, + _mm512_set1_epi16(1 - 0x4000)); // 1 - 0x4000 = 0xc001 = 1100 0000 0000 0001 + + // in the second 32-bit half, set first or second option based on whether + // original input is leading byte (second case) or not (first case) + __m512i leadingB = + _mm512_mask_blend_epi16((__mmask32)(sixth >> 32), + _mm512_set1_epi16(0x00c2), // 0000 0000 1100 0010 + _mm512_set1_epi16(0x40c3)); // 0100 0000 1100 0011 + __m512i outputB = _mm512_ternarylogic_epi32( + input_interleaved, leadingB, _mm512_set1_epi16((short)0xff00), + (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB + + // prune redundant bytes + outputA = _mm512_maskz_compress_epi8(maskA, outputA); + outputB = _mm512_maskz_compress_epi8(maskB, outputB); + + size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32; + + if (mask_output) { + if (input_len > 32) { // is the second half of the input vector used? + __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA); + _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA); + utf8_output += output_sizeA; + write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA)); + _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB); + } else { + __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size); + _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA); + } + } else { + _mm512_storeu_si512(utf8_output, outputA); + utf8_output += output_sizeA; + _mm512_storeu_si512(utf8_output, outputB); + } + return output_size; +} + +static inline size_t latin1_to_utf8_avx512_branch(__m512i input, + char *utf8_output) { + __mmask64 nonascii = _mm512_movepi8_mask(input); + if (nonascii) { + return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0); + } else { + _mm512_storeu_si512(utf8_output, input); + return 64; + } +} + +size_t latin1_to_utf8_avx512_start(const char *buf, size_t len, + char *utf8_output) { + char *start = utf8_output; + size_t pos = 0; + // if there's at least 128 bytes remaining, we don't need to mask the output + for (; pos + 128 <= len; pos += 64) { + __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos)); + utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output); + } + // in the last 128 bytes, the first 64 may require masking the output + if (pos + 64 <= len) { + __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos)); + utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1); + pos += 64; + } + // with the last 64 bytes, the input also needs to be masked + if (pos < len) { + __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos)); + __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos)); + utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1); + } + return (size_t)(utf8_output - start); +} +/* end file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */ +// file included directly +template <endianness big_endian> +size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len, + char16_t *utf16_output) { + size_t rounded_len = len & ~0x1F; // Round down to nearest multiple of 32 + + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001,
0x0e0f0c0d0a0b0809); + for (size_t i = 0; i < rounded_len; i += 32) { + // Load 32 Latin1 characters into a 256-bit register + __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]); + // Zero extend each set of 8 Latin1 characters to 32 16-bit integers + __m512i out = _mm512_cvtepu8_epi16(in); + if (big_endian) { + out = _mm512_shuffle_epi8(out, byteflip); + } + // Store the results back to memory + _mm512_storeu_si512((__m512i *)&utf16_output[i], out); + } + if (rounded_len != len) { + uint32_t mask = uint32_t(1 << (len - rounded_len)) - 1; + __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len); + + // Zero extend each set of 8 Latin1 characters to 32 16-bit integers + __m512i out = _mm512_cvtepu8_epi16(in); + if (big_endian) { + out = _mm512_shuffle_epi8(out, byteflip); + } + // Store the results back to memory + _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out); + } + + return len; +} +/* end file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 +/* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */ +void avx512_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + while (len >= 16) { + // Load 16 Latin1 characters into a 128-bit register + __m128i in = _mm_loadu_si128((__m128i *)buf); + + // Zero extend each set of 8 Latin1 characters to 16 32-bit integers using + // vpmovzxbd + __m512i out = _mm512_cvtepu8_epi32(in); + + // Store the results back to memory + _mm512_storeu_si512((__m512i *)utf32_output, out); + + len -= 16; + buf += 16; + utf32_output += 16; + } + + __mmask16 mask = __mmask16((1 << len) - 1); + __m128i in = _mm_maskz_loadu_epi8(mask, buf); + __m512i out = _mm512_cvtepu8_epi32(in); + _mm512_mask_storeu_epi32((__m512i *)utf32_output, mask, out); +} +/* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_BASE64 +/* begin file src/icelake/icelake_base64.inl.cpp */ +// file included directly +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). 
+ */ + +struct block64 { + __m512i chunks[1]; +}; + +template +size_t encode_base64_impl(char *dst, const char *src, size_t srclen, + base64_options options, + size_t line_length = simdutf::default_line_length) { + size_t offset = 0; + if (line_length < 4) { + line_length = 4; // We do not support line_length less than 4 + } + // credit: Wojciech Muła + const uint8_t *input = (const uint8_t *)src; + + uint8_t *out = (uint8_t *)dst; + static const char *lookup_tbl = + base64_url + ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + const __m512i shuffle_input = _mm512_setr_epi32( + 0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10, + 0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122, + 0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e); + const __m512i lookup = + _mm512_loadu_si512(reinterpret_cast(lookup_tbl)); + const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a)); + size_t size = srclen; + __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1 + // We want that input == end_input means that we must stop. + const uint8_t *end_input = input + (size - (size % 48)); + while (input != end_input) { + const __m512i v = _mm512_maskz_loadu_epi8( + input_mask, reinterpret_cast(input)); + const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); + const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in); + const __m512i result = _mm512_permutexvar_epi8(indices, lookup); + if (use_lines) { + if (offset + 64 > line_length) { + if (line_length >= 64) { + __m512i expanded = _mm512_mask_expand_epi8( + _mm512_set1_epi8('\n'), ~(1ULL << ((line_length - offset))), + result); + _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), expanded); + __m128i last_lane = + _mm512_extracti32x4_epi32(result, 3); // Lane 3 (bytes 48-63) + uint8_t last_byte = + static_cast(_mm_extract_epi8(last_lane, 15)); + out[64] = last_byte; + out += 65; + offset = 64 - (line_length - offset); + } else { // slow path + alignas(64) uint8_t local_buffer[64]; + _mm512_storeu_si512(reinterpret_cast<__m512i *>(local_buffer), + result); + size_t out_pos = 0; + size_t local_offset = offset; + for (size_t j = 0; j < 64;) { + if (local_offset == line_length) { + out[out_pos++] = '\n'; + local_offset = 0; + } + out[out_pos++] = local_buffer[j++]; + local_offset++; + } + offset = local_offset; + out += out_pos; + } + } else { + _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result); + offset += 64; + out += 64; + } + } else { + _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result); + out += 64; + } + input += 48; + } + size = size % 48; + + input_mask = ((__mmask64)1 << size) - 1; + const __m512i v = _mm512_maskz_loadu_epi8( + input_mask, reinterpret_cast(input)); + const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); + const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in); + bool padding_needed = + (((options & base64_url) == 0) ^ + ((options & base64_reverse_padding) == base64_reverse_padding)); + size_t padding_amount = ((size % 3) > 0) ? (3 - (size % 3)) : 0; + size_t output_len = ((size + 2) / 3) * 4; + size_t non_padded_output_len = output_len - padding_amount; + if (!padding_needed) { + output_len = non_padded_output_len; + } + // If no output, we are done. 
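+ // (Worked example: with size = 5 leftover bytes, padding_amount = 1,
+ // output_len = ((5 + 2) / 3) * 4 = 8 and non_padded_output_len = 7,
+ // i.e. seven base64 characters followed by a single '=' when padding
+ // is emitted.)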
+ if (output_len == 0) { + return (size_t)(out - (uint8_t *)dst); + } + __mmask64 output_mask = 0xFFFFFFFFFFFFFFFF >> (64 - output_len); + __m512i result = _mm512_mask_permutexvar_epi8( + _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1, + indices, lookup); + if (use_lines) { + if (offset + output_len > line_length) { + if (line_length >= 64) { + __m512i expanded = _mm512_mask_expand_epi8( + _mm512_set1_epi8('\n'), ~(1ULL << ((line_length - offset))), + result); + if (output_len == 64) { + _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), expanded); + out += 64; + _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out - 63), + 1ULL << 63, result); + out++; + } else { + output_mask = 0xFFFFFFFFFFFFFFFF >> (64 - output_len - 1); + _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask, + expanded); + out += output_len + 1; + } + } else { + alignas(64) uint8_t local_buffer[64]; + _mm512_storeu_si512(reinterpret_cast<__m512i *>(local_buffer), result); + size_t out_pos = 0; + size_t local_offset = offset; + for (size_t j = 0; j < output_len;) { + if (local_offset == line_length) { + out[out_pos++] = '\n'; + local_offset = 0; + } + out[out_pos++] = local_buffer[j++]; + local_offset++; + } + offset = local_offset; + out += out_pos; + } + } else { + _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask, + result); + out += output_len; + } + } else { + _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask, + result); + out += output_len; + } + return (size_t)(out - (uint8_t *)dst); +} + +template +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + return encode_base64_impl(dst, src, srclen, options); +} + +template +static inline uint64_t to_base64_mask(block64 *b, uint64_t *error, + uint64_t input_mask = UINT64_MAX) { + __m512i input = b->chunks[0]; + const __m512i ascii_space_tbl = _mm512_set_epi8( + 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, + 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32); + __m512i lookup0; + if (default_or_url) { + lookup0 = _mm512_set_epi8( + -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, + 52, 63, -128, 62, -128, 62, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, + -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1); + } else if (base64_url) { + lookup0 = _mm512_set_epi8( + -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, + 52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, + -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1); + } else { + lookup0 = _mm512_set_epi8( + -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, + 52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, + -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128); + } + __m512i lookup1; + if (default_or_url) { + lookup1 = _mm512_set_epi8( + -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, + 
41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, + 63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); + } else if (base64_url) { + lookup1 = _mm512_set_epi8( + -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, + 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, + 63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); + } else { + lookup1 = _mm512_set_epi8( + -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, + 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, + -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); + } + + const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1); + const __m512i combined = _mm512_or_si512(translated, input); + const __mmask64 mask = _mm512_movepi8_mask(combined) & input_mask; + if (!ignore_garbage && mask) { + const __mmask64 spaces = + _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(ascii_space_tbl, input), + input) & + input_mask; + *error = (mask ^ spaces); + } + b->chunks[0] = translated; + + return mask | (~input_mask); +} + +static inline void copy_block(block64 *b, char *output) { + _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]); +} + +static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { + uint64_t nmask = ~mask; + __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]); + _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c); + return _mm_popcnt_u64(nmask); +} + +// The caller of this function is responsible to ensure that there are 64 bytes +// available from reading at src. The data is read into a block64 structure. +static inline void load_block(block64 *b, const char *src) { + b->chunks[0] = _mm512_loadu_si512(reinterpret_cast(src)); +} + +static inline void load_block_partial(block64 *b, const char *src, + __mmask64 input_mask) { + b->chunks[0] = _mm512_maskz_loadu_epi8( + input_mask, reinterpret_cast(src)); +} + +// The caller of this function is responsible to ensure that there are 128 bytes +// available from reading at src. The data is read into a block64 structure. 
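+// The 16-bit code units are narrowed to bytes with unsigned saturation:
+// _mm512_packus_epi16 works 128-bit lane by lane, interleaving its two
+// sources, so the 64-bit permute afterwards restores the original order.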
+static inline void load_block(block64 *b, const char16_t *src) { + __m512i m1 = _mm512_loadu_si512(reinterpret_cast(src)); + __m512i m2 = _mm512_loadu_si512(reinterpret_cast(src + 32)); + __m512i p = _mm512_packus_epi16(m1, m2); + b->chunks[0] = + _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); +} + +static inline void load_block_partial(block64 *b, const char16_t *src, + __mmask64 input_mask) { + __m512i m1 = _mm512_maskz_loadu_epi16((__mmask32)input_mask, + reinterpret_cast(src)); + __m512i m2 = + _mm512_maskz_loadu_epi16((__mmask32)(input_mask >> 32), + reinterpret_cast(src + 32)); + __m512i p = _mm512_packus_epi16(m1, m2); + b->chunks[0] = + _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); +} + +static inline void base64_decode(char *out, __m512i str) { + const __m512i merge_ab_and_bc = + _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140)); + const __m512i merged = + _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); + const __m512i pack = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58, + 52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34, + 28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4, + 5, 6, 0, 1, 2); + const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); + _mm512_mask_storeu_epi8( + (__m512i *)out, 0xffffffffffff, + shuffled); // mask would be 0xffffffffffff since we write 48 bytes. +} +// decode 64 bytes and output 48 bytes +static inline void base64_decode_block(char *out, const char *src) { + base64_decode(out, + _mm512_loadu_si512(reinterpret_cast(src))); +} +static inline void base64_decode_block(char *out, block64 *b) { + base64_decode(out, b->chunks[0]); +} + +template +full_result +compress_decode_base64(char *dst, const chartype *src, size_t srclen, + base64_options options, + last_chunk_handling_options last_chunk_options) { + (void)options; + const uint8_t *to_base64 = + default_or_url ? tables::base64::to_base64_default_or_url_value + : (base64_url ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value); + auto ri = simdutf::scalar::base64::find_end(src, srclen, options); + size_t equallocation = ri.equallocation; + size_t padding_characters = ri.equalsigns; + srclen = ri.srclen; + size_t full_input_length = ri.full_input_length; + if (srclen == 0) { + if (!ignore_garbage && padding_characters > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, full_input_length, 0}; + } + const chartype *const srcinit = src; + const char *const dstinit = dst; + const chartype *const srcend = src + srclen; + + // figure out why block_size == 2 is sometimes best??? + constexpr size_t block_size = 6; + char buffer[block_size * 64]; + char *bufferptr = buffer; + if (srclen >= 64) { + const chartype *const srcend64 = src + srclen - 64; + while (src <= srcend64) { + block64 b; + load_block(&b, src); + src += 64; + uint64_t error = 0; + uint64_t badcharmask = + to_base64_mask(&b, + &error); + if (!ignore_garbage && error) { + src -= 64; + size_t error_offset = _tzcnt_u64(error); + return {error_code::INVALID_BASE64_CHARACTER, + size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; + } + if (badcharmask != 0) { + // optimization opportunity: check for simple masks like those made of + // continuous 1s followed by continuous 0s. And masks containing a + // single bad character. 
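+ // (For instance, a badcharmask whose set bits form one trailing run,
+ // i.e. garbage only at the very end of the block, could be handled by
+ // a single masked store rather than a general compress.)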
+ bufferptr += compress_block(&b, badcharmask, bufferptr); + } else if (bufferptr != buffer) { + copy_block(&b, bufferptr); + bufferptr += 64; + } else { + base64_decode_block(dst, &b); + dst += 48; + } + if (bufferptr >= (block_size - 1) * 64 + buffer) { + for (size_t i = 0; i < (block_size - 1); i++) { + base64_decode_block(dst, buffer + i * 64); + dst += 48; + } + std::memcpy(buffer, buffer + (block_size - 1) * 64, + 64); // 64 might be too much + bufferptr -= (block_size - 1) * 64; + } + } + } + + int last_block_len = (int)(srcend - src); + if (last_block_len != 0) { + __mmask64 input_mask = ((__mmask64)1 << last_block_len) - 1; + block64 b; + load_block_partial(&b, src, input_mask); + uint64_t error = 0; + uint64_t badcharmask = + to_base64_mask(&b, &error, + input_mask); + if (!ignore_garbage && error) { + size_t error_offset = _tzcnt_u64(error); + return {error_code::INVALID_BASE64_CHARACTER, + size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; + } + src += last_block_len; + bufferptr += compress_block(&b, badcharmask, bufferptr); + } + + char *buffer_start = buffer; + for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { + base64_decode_block(dst, buffer_start); + dst += 48; + } + if ((bufferptr - buffer_start) != 0) { + // For efficiency reasons, we end up reproducing much of the code + // in base64_tail_decode_impl. Better engineering would be to + // refactor the code so that we can call it without a performance hit. + size_t rem = (bufferptr - buffer_start); + int idx = rem % 4; + __mmask64 mask = ((__mmask64)1 << rem) - 1; + __m512i input = _mm512_maskz_loadu_epi8(mask, buffer_start); + size_t output_len = (rem / 4) * 3; + __mmask64 output_mask = mask >> (rem - output_len); + const __m512i merge_ab_and_bc = + _mm512_maddubs_epi16(input, _mm512_set1_epi32(0x01400140)); + const __m512i merged = + _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); + const __m512i pack = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58, + 52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34, + 28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4, + 5, 6, 0, 1, 2); + const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); + // We never should have that the number of base64 characters + the + // number of padding characters is more than 4. + if (!ignore_garbage && (idx + padding_characters > 4)) { + return {INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit), true}; + } + // The idea here is that in loose mode, + // if there is padding at all, it must be used + // to form 4-wise chunk. However, in loose mode, + // we do accept no padding at all. + if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::loose && + (idx >= 2) && padding_characters > 0 && + ((idx + padding_characters) & 3) != 0) { + return {INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit), true}; + } else + // The idea here is that in strict mode, we do not want to accept + // incomplete base64 chunks. So if the chunk was otherwise valid, we + // return BASE64_INPUT_REMAINDER. 
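+ // (Example: the six characters "QUJDRE" leave idx == 2 with no padding,
+ // so strict mode reports BASE64_INPUT_REMAINDER for the incomplete
+ // chunk.)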
+ if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::strict && + (idx >= 2) && ((idx + padding_characters) & 3) != 0) { + // The partial chunk was at src - idx + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + return {BASE64_INPUT_REMAINDER, equallocation, size_t(dst - dstinit)}; + } else + // If there is a partial chunk with insufficient padding, with + // stop_before_partial, we need to just ignore it. In "only full" mode, + // skip the minute there are padding characters. + if ((last_chunk_options == + last_chunk_handling_options::stop_before_partial && + (padding_characters + idx < 4) && (idx != 0) && + (idx >= 2 || padding_characters == 0)) || + (last_chunk_options == + last_chunk_handling_options::only_full_chunks && + (idx >= 2 || padding_characters == 0))) { + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + // we need to rewind src to before the partial chunk + size_t characters_to_skip = idx; + while (characters_to_skip > 0) { + src--; + auto c = *src; + uint8_t code = to_base64[uint8_t(c)]; + if (simdutf::scalar::base64::is_eight_byte(c) && code <= 63) { + characters_to_skip--; + } + } + // And then we need to skip ignored characters + // See https://github.com/simdutf/simdutf/issues/824 + while (src > srcinit) { + auto c = *(src - 1); + uint8_t code = to_base64[uint8_t(c)]; + if (simdutf::scalar::base64::is_eight_byte(c) && code <= 63) { + break; + } + src--; + } + return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)}; + } else { + if (idx == 2) { + if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::strict) { + uint32_t triple = (uint32_t(bufferptr[-2]) << 3 * 6) + + (uint32_t(bufferptr[-1]) << 2 * 6); + if (triple & 0xffff) { + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + return {BASE64_EXTRA_BITS, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + } + output_mask = (output_mask << 1) | 1; + output_len += 1; + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + } else if (idx == 3) { + if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::strict) { + uint32_t triple = (uint32_t(bufferptr[-3]) << 3 * 6) + + (uint32_t(bufferptr[-2]) << 2 * 6) + + (uint32_t(bufferptr[-1]) << 1 * 6); + if (triple & 0xff) { + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + return {BASE64_EXTRA_BITS, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + } + output_mask = (output_mask << 2) | 3; + output_len += 2; + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + } else if (!ignore_garbage && idx == 1 && + (!is_partial(last_chunk_options) || + (is_partial(last_chunk_options) && + padding_characters > 0))) { + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } else if (!ignore_garbage && idx == 0 && padding_characters > 0) { + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + return {INVALID_BASE64_CHARACTER, equallocation, + size_t(dst - dstinit)}; + } else { + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + } + } + if (!ignore_garbage && !is_partial(last_chunk_options) && + padding_characters > 0) { + size_t output_count = size_t(dst - dstinit); + if ((output_count % 3 == 0) || + ((output_count 
% 3) + 1 + padding_characters != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, output_count}; + } + } + return {SUCCESS, full_input_length, size_t(dst - dstinit)}; + } + + if (!ignore_garbage && padding_characters > 0) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + padding_characters != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; + } + } + return {SUCCESS, srclen, size_t(dst - dstinit)}; +} +/* end file src/icelake/icelake_base64.inl.cpp */ +/* begin file src/icelake/icelake_find.inl.cpp */ +simdutf_really_inline const char *util_find(const char *start, const char *end, + char character) noexcept { + // Handle empty or invalid range + if (start >= end) + return end; + const size_t step = 64; + __m512i char_vec = _mm512_set1_epi8(character); + + // Handle unaligned beginning with a masked load + uintptr_t misalignment = reinterpret_cast(start) % step; + if (misalignment != 0) { + size_t adjustment = step - misalignment; + if (size_t(end - start) < adjustment) { + adjustment = end - start; + } + __mmask64 load_mask = 0xFFFFFFFFFFFFFFFF >> (64 - adjustment); + __m512i data = _mm512_maskz_loadu_epi8( + load_mask, reinterpret_cast(start)); + __mmask64 match_mask = _mm512_cmpeq_epi8_mask(data, char_vec); + + if (match_mask != 0) { + size_t index = _tzcnt_u64(match_mask); + return start + index; + } + start += adjustment; + } + // Process 64 bytes (512 bits) at a time with AVX-512 + // Main loop for full 128-byte chunks + while (size_t(end - start) >= 2 * step) { + __m512i data1 = + _mm512_loadu_si512(reinterpret_cast(start)); + __mmask64 mask1 = _mm512_cmpeq_epi8_mask(data1, char_vec); + + __m512i data2 = + _mm512_loadu_si512(reinterpret_cast(start + step)); + __mmask64 mask2 = _mm512_cmpeq_epi8_mask(data2, char_vec); + if (!_kortestz_mask64_u8(mask1, mask2)) { + if (mask1 != 0) { + // Found a match, return the first one + size_t index = _tzcnt_u64(mask1); + return start + index; + } + size_t index = _tzcnt_u64(mask2); + return start + index + step; + } + start += 2 * step; + } + + // Main loop for full 64-byte chunks + while (size_t(end - start) >= step) { + __m512i data = _mm512_loadu_si512(reinterpret_cast(start)); + __mmask64 mask = _mm512_cmpeq_epi8_mask(data, char_vec); + + if (mask != 0) { + // Found a match, return the first one + size_t index = _tzcnt_u64(mask); + return start + index; + } + + start += step; + } + + // Handle remaining bytes with masked load + size_t remaining = end - start; + if (remaining > 0) { + // Create a mask for the remaining bytes using shifted 0xFFFFFFFFFFFFFFFF + __mmask64 load_mask = 0xFFFFFFFFFFFFFFFF >> (64 - remaining); + __m512i data = _mm512_maskz_loadu_epi8( + load_mask, reinterpret_cast(start)); + __mmask64 match_mask = _mm512_cmpeq_epi8_mask(data, char_vec); + + // Apply load mask to avoid false positives + match_mask &= load_mask; + + if (match_mask != 0) { + // Found a match in the remaining bytes + size_t index = _tzcnt_u64(match_mask); + return start + index; + } + } + + return end; +} + +simdutf_really_inline const char16_t *util_find(const char16_t *start, + const char16_t *end, + char16_t character) noexcept { + // Handle empty or invalid range + if (start >= end) + return end; + + // Process 32 char16_t (64 bytes, 512 bits) at a time with AVX-512 + const size_t step = 32; + __m512i char_vec = _mm512_set1_epi16(character); + + // Handle unaligned beginning with a masked load + uintptr_t misalignment = + reinterpret_cast(start) % (step * sizeof(char16_t)); + if 
(misalignment != 0 && misalignment % 2 == 0) { + size_t adjustment = + (step * sizeof(char16_t) - misalignment) / sizeof(char16_t); + if (size_t(end - start) < adjustment) { + adjustment = end - start; + } + __mmask32 load_mask = 0xFFFFFFFF >> (32 - adjustment); + __m512i data = _mm512_maskz_loadu_epi16( + load_mask, reinterpret_cast(start)); + __mmask32 match_mask = _mm512_cmpeq_epi16_mask(data, char_vec); + + if (match_mask != 0) { + size_t index = _tzcnt_u32(match_mask); + return start + index; + } + start += adjustment; + } + + // Main loop for full 32-element chunks + while (size_t(end - start) >= step) { + __m512i data = _mm512_loadu_si512(reinterpret_cast(start)); + __mmask32 mask = _mm512_cmpeq_epi16_mask(data, char_vec); + + if (mask != 0) { + // Found a match, return the first one + size_t index = _tzcnt_u32(mask); + return start + index; + } + + start += step; + } + + // Handle remaining elements with masked load + size_t remaining = end - start; + if (remaining > 0) { + __mmask32 load_mask = 0xFFFFFFFF >> (32 - remaining); + __m512i data = _mm512_maskz_loadu_epi16( + load_mask, reinterpret_cast(start)); + __mmask32 match_mask = _mm512_cmpeq_epi16_mask(data, char_vec); + + if (match_mask != 0) { + size_t index = _tzcnt_u32(match_mask); + return start + index; + } + } + + return end; +} +/* end file src/icelake/icelake_find.inl.cpp */ +#endif // SIMDUTF_FEATURE_BASE64 + +#include + +} // namespace +} // namespace icelake +} // namespace simdutf + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/generic/utf32.h */ +#include + +namespace simdutf { +namespace icelake { +namespace { +namespace utf32 { + +template T min(T a, T b) { return a <= b ? a : b; } + +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { + using vector_u32 = simd32; + + const char32_t *start = input; + + // we add up to three ones in a single iteration (see the vectorized loop in + // section #2 below) + const size_t max_increment = 3; + + const size_t N = vector_u32::ELEMENTS; + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else + const auto v_ffffff80 = vector_u32::splat(0xffffff80); + const auto v_fffff800 = vector_u32::splat(0xfffff800); + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + size_t counter = 0; + + // 1. 
vectorized loop unrolled 4 times + { + // we use vector of uint32 counters, this is why this limit is used + const size_t max_iterations = + std::numeric_limits::max() / (max_increment * 4); + size_t blocks = length / (N * 4); + length -= blocks * (N * 4); + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + simd32 acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in0 = vector_u32(input + 0 * N); + const auto in1 = vector_u32(input + 1 * N); + const auto in2 = vector_u32(input + 2 * N); + const auto in3 = vector_u32(input + 3 * N); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else + acc += min(one, in0 & v_ffffff80); + acc += min(one, in1 & v_ffffff80); + acc += min(one, in2 & v_ffffff80); + acc += min(one, in3 & v_ffffff80); + + acc += min(one, in0 & v_fffff800); + acc += min(one, in1 & v_fffff800); + acc += min(one, in2 & v_fffff800); + acc += min(one, in3 & v_fffff800); + + acc += min(one, in0 & v_ffff0000); + acc += min(one, in1 & v_ffff0000); + acc += min(one, in2 & v_ffff0000); + acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += 4 * N; + } + + counter += acc.sum(); + } + } + + // 2. vectorized loop for tail + { + const size_t max_iterations = + std::numeric_limits::max() / max_increment; + size_t blocks = length / N; + length -= blocks * N; + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + auto acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in = vector_u32(input); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else + acc += min(one, in & v_ffffff80); + acc += min(one, in & v_fffff800); + acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += N; + } + + counter += acc.sum(); + } + } + + const size_t consumed = input - start; + if (consumed != 0) { + // We don't count 0th bytes in the vectorized loops above, this + // is why we need to count them in the end. + counter += consumed; + } + + return counter + scalar::utf32::utf8_length_from_utf32(input, length); +} + +} // namespace utf32 +} // unnamed namespace +} // namespace icelake +} // namespace simdutf +/* end file src/generic/utf32.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +namespace simdutf { +namespace icelake { + +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. 
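+ // (For example, a leading 0xFF 0xFE that is not followed by 0x00 0x00 is
+ // reported as UTF16_LE without inspecting the rest of the input.)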
+ auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + + int out = 0; + uint32_t utf16_err = (length % 2); + uint32_t utf32_err = (length % 4); + uint32_t ends_with_high = 0; + avx512_utf8_checker checker{}; + const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000); + __m512i currentmax = _mm512_setzero_si512(); + __m512i currentoffsetmax = _mm512_setzero_si512(); + const char *ptr = input; + const char *end = ptr + length; + for (; end - ptr >= 64; ptr += 64) { + // utf8 checks + const __m512i data = _mm512_loadu_si512((const __m512i *)ptr); + checker.check_next_input(data); + + // utf16le_checks + __m512i diff = _mm512_sub_epi16(data, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + utf16_err |= (((highsurrogates << 1) | ends_with_high) != lowsurrogates); + ends_with_high = ((highsurrogates & 0x80000000) != 0); + + // utf32le checks + currentoffsetmax = + _mm512_max_epu32(_mm512_add_epi32(data, offset), currentoffsetmax); + currentmax = _mm512_max_epu32(data, currentmax); + } + + // last block with 0 <= len < 64 + __mmask64 read_mask = (__mmask64(1) << (end - ptr)) - 1; + const __m512i data = _mm512_maskz_loadu_epi8(read_mask, (const __m512i *)ptr); + checker.check_next_input(data); + + __m512i diff = _mm512_sub_epi16(data, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + utf16_err |= (((highsurrogates << 1) | ends_with_high) != lowsurrogates); + + currentoffsetmax = + _mm512_max_epu32(_mm512_add_epi32(data, offset), currentoffsetmax); + currentmax = _mm512_max_epu32(data, currentmax); + + const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff); + const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff); + __m512i is_zero = + _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax); + utf32_err |= (_mm512_test_epi8_mask(is_zero, is_zero) != 0); + is_zero = _mm512_xor_si512( + _mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + utf32_err |= (_mm512_test_epi8_mask(is_zero, is_zero) != 0); + checker.check_eof(); + bool is_valid_utf8 = !checker.errors(); + if (is_valid_utf8) { + out |= encoding_type::UTF8; + } + if (utf16_err == 0) { + out |= encoding_type::UTF16_LE; + } + if (utf32_err == 0) { + out |= encoding_type::UTF32_LE; + } + return out; +} +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return true; + } + avx512_utf8_checker checker{}; + const char *ptr = buf; + const char *end = ptr + len; + for (; end - ptr >= 64; ptr += 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + checker.check_next_input(utf8); + } + if (end != ptr) { + const __m512i utf8 = _mm512_maskz_loadu_epi8( + ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + return !checker.errors(); +} 
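+// Usage sketch (illustrative only, not part of the generated amalgamation):
+// callers normally reach this kernel through the public entry point, which
+// dispatches to the best implementation at runtime, e.g.
+//
+//   #include "simdutf.h"
+//   bool all_valid(const char *data, size_t bytes) {
+//     return simdutf::validate_utf8(data, bytes);
+//   }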
+#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, len); + } + avx512_utf8_checker checker{}; + const char *ptr = buf; + const char *end = ptr + len; + size_t count{0}; + for (; end - ptr >= 64; ptr += 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + checker.check_next_input(utf8); + if (checker.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(buf), + reinterpret_cast(buf + count), len - count); + res.count += count; + return res; + } + count += 64; + } + if (end != ptr) { + const __m512i utf8 = _mm512_maskz_loadu_epi8( + ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + if (checker.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(buf), + reinterpret_cast(buf + count), len - count); + res.count += count; + return res; + } + return result(error_code::SUCCESS, len); +} +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return icelake::validate_ascii(buf, len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + const char *buf_orig = buf; + const char *end = buf + len; + const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); + for (; end - buf >= 64; buf += 64) { + const __m512i input = _mm512_loadu_si512((const __m512i *)buf); + __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); + if (notascii) { + return result(error_code::TOO_LARGE, + buf - buf_orig + _tzcnt_u64(notascii)); + } + } + if (end != buf) { + const __m512i input = _mm512_maskz_loadu_epi8( + ~UINT64_C(0) >> (64 - (end - buf)), (const __m512i *)buf); + __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); + if (notascii) { + return result(error_code::TOO_LARGE, + buf - buf_orig + _tzcnt_u64(notascii)); + } + } + return result(error_code::SUCCESS, len); +} +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept { + const char16_t *end = buf + len; + __m512i limit = _mm512_set1_epi16(uint16_t(0x007F)); + for (; end - buf >= 32;) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + auto mask = _mm512_cmpgt_epu16_mask(in, limit); + if (mask) { + return false; + } + buf += 32; + } + if (buf < end) { + __m512i in = + _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf); + auto mask = _mm512_cmpgt_epu16_mask(in, limit); + if (mask) { + return false; + } + } + return true; +} + +simdutf_warn_unused bool +implementation::validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept { + const char16_t *end = buf + len; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + __m512i limit 
= _mm512_set1_epi16(uint16_t(0x007F)); + for (; end - buf >= 32;) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + in = _mm512_shuffle_epi8(in, byteflip); + auto mask = _mm512_cmpgt_epu16_mask(in, limit); + if (mask) { + return false; + } + buf += 32; + } + if (buf < end) { + __m512i in = + _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf); + in = _mm512_shuffle_epi8(in, byteflip); + auto mask = _mm512_cmpgt_epu16_mask(in, limit); + if (mask) { + return false; + } + } + return true; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + const char16_t *end = buf + len; + + // Optimized: Process 64 code units (2x 512-bit) per iteration + const __m512i surr_base = _mm512_set1_epi16(uint16_t(0xD800)); + const __m512i surr_range = _mm512_set1_epi16(uint16_t(0x0800)); + const __m512i high_range = _mm512_set1_epi16(uint16_t(0x0400)); + + for (; end - buf >= 64;) { + __m512i in_1 = _mm512_loadu_si512((__m512i *)buf); + __m512i in_2 = _mm512_loadu_si512((__m512i *)(buf + 32)); + + __m512i diff_1 = _mm512_sub_epi16(in_1, surr_base); + __m512i diff_2 = _mm512_sub_epi16(in_2, surr_base); + + __mmask32 surrogates_1 = _mm512_cmplt_epu16_mask(diff_1, surr_range); + __mmask32 surrogates_2 = _mm512_cmplt_epu16_mask(diff_2, surr_range); + + if (surrogates_1 | surrogates_2) { + __mmask32 highsurrogates_1 = _mm512_cmplt_epu16_mask(diff_1, high_range); + __mmask32 lowsurrogates_1 = surrogates_1 ^ highsurrogates_1; + + __mmask32 highsurrogates_2 = _mm512_cmplt_epu16_mask(diff_2, high_range); + __mmask32 lowsurrogates_2 = surrogates_2 ^ highsurrogates_2; + + // Validate first block: high must be followed by low + if ((highsurrogates_1 << 1) != lowsurrogates_1) { + return false; + } + + // Check boundary between blocks: if first block ends with high, second + // must start with low + bool ends_with_high_1 = ((highsurrogates_1 & 0x80000000) != 0); + bool starts_with_low_2 = ((lowsurrogates_2 & 0x1) != 0); + if (ends_with_high_1 && !starts_with_low_2) { + return false; + } + + // Validate second block (shift by 1 if first ended with high) + __mmask32 expected_low_2 = ends_with_high_1 + ? (highsurrogates_2 << 1) | 0x1 + : (highsurrogates_2 << 1); + if (expected_low_2 != lowsurrogates_2) { + return false; + } + + bool ends_with_high_2 = ((highsurrogates_2 & 0x80000000) != 0); + if (ends_with_high_2) { + buf += 63; // advance by 63 to start with high surrogate next round + } else { + buf += 64; + } + } else { + buf += 64; + } + } + + // Handle remaining 32-63 code units + for (; end - buf >= 32;) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + __m512i diff = _mm512_sub_epi16(in, surr_base); + __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, surr_range); + if (surrogates) { + __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, high_range); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if (ends_with_high) { + buf += 31; // advance only by 31 code units so that we start with the + // high surrogate on the next round. 
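+ // (For example, if code unit 31 is a lone high surrogate such as 0xD800,
+ // the next iteration re-reads it together with its successor, so the
+ // pairing check sees whether a low surrogate follows.)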
+ } else { + buf += 32; + } + } else { + buf += 32; + } + } + if (buf < end) { + __m512i in = + _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } + } + } + return true; +} +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + const char16_t *end = buf + len; + + for (; end - buf >= 32;) { + __m512i in = _mm512_slli_epi32(_mm512_loadu_si512((__m512i *)buf), 8); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if (ends_with_high) { + buf += 31; // advance only by 31 code units so that we start with the + // high surrogate on the next round. + } else { + buf += 32; + } + } else { + buf += 32; + } + } + if (buf < end) { + __m512i in = _mm512_slli_epi16( + _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf), 8); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } + } + } + return true; +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + const char16_t *start_buf = buf; + const char16_t *end = buf + len; + for (; end - buf >= 32;) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); + uint32_t extra_high = + _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, + (buf - start_buf) + + (extra_low < extra_high ? extra_low : extra_high)); + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if (ends_with_high) { + buf += 31; // advance only by 31 code units so that we start with the + // high surrogate on the next round. 
+ } else { + buf += 32; + } + } else { + buf += 32; + } + } + if (buf < end) { + __m512i in = + _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); + uint32_t extra_high = + _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, + (buf - start_buf) + + (extra_low < extra_high ? extra_low : extra_high)); + } + } + } + return result(error_code::SUCCESS, len); +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + const char16_t *start_buf = buf; + const char16_t *end = buf + len; + + for (; end - buf >= 32;) { + __m512i in = _mm512_slli_epi16(_mm512_loadu_si512((__m512i *)buf), 8); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); + uint32_t extra_high = + _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, + (buf - start_buf) + + (extra_low < extra_high ? extra_low : extra_high)); + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if (ends_with_high) { + buf += 31; // advance only by 31 code units so that we start with the + // high surrogate on the next round. + } else { + buf += 32; + } + } else { + buf += 32; + } + } + if (buf < end) { + __m512i in = _mm512_slli_epi16( + _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf), 8); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); + uint32_t extra_high = + _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, + (buf - start_buf) + + (extra_low < extra_high ? 
extra_low : extra_high)); + } + } + } + return result(error_code::SUCCESS, len); +} + +void implementation::to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return utf16fix_avx512(input, len, output); +} + +void implementation::to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return utf16fix_avx512(input, len, output); +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + return icelake::validate_utf32(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + const char32_t *buf_orig = buf; + if (len >= 16) { + const char32_t *end = buf + len - 16; + while (buf <= end) { + __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf); + __mmask16 outside_range = _mm512_cmp_epu32_mask( + utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT); + + __m512i utf32_off = + _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); + + __mmask16 surrogate_range = _mm512_cmp_epu32_mask( + utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT); + if ((outside_range | surrogate_range)) { + auto outside_idx = _tzcnt_u32(outside_range); + auto surrogate_idx = _tzcnt_u32(surrogate_range); + + if (outside_idx < surrogate_idx) { + return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx); + } + + return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx); + } + + buf += 16; + } + } + if (len > 0) { + __m512i utf32 = _mm512_maskz_loadu_epi32( + __mmask16((1U << (buf_orig + len - buf)) - 1), (const __m512i *)buf); + __mmask16 outside_range = _mm512_cmp_epu32_mask( + utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT); + __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); + + __mmask16 surrogate_range = _mm512_cmp_epu32_mask( + utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT); + if ((outside_range | surrogate_range)) { + auto outside_idx = _tzcnt_u32(outside_range); + auto surrogate_idx = _tzcnt_u32(surrogate_range); + + if (outside_idx < surrogate_idx) { + return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx); + } + + return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx); + } + } + + return result(error_code::SUCCESS, len); +} +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + return icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return icelake_convert_latin1_to_utf16(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return icelake_convert_latin1_to_utf16(buf, len, + utf16_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t 
implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  avx512_convert_latin1_to_utf32(buf, len, utf32_output);
+  return len;
+}
+#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake::utf8_to_latin1_avx512(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  // First, try to convert as much as possible using the SIMD implementation.
+  const char *obuf = buf;
+  char *olatin1_output = latin1_output;
+  size_t written = icelake::utf8_to_latin1_avx512(obuf, len, olatin1_output);
+
+  // If we have completely converted the string
+  if (obuf == buf + len) {
+    return {simdutf::SUCCESS, written};
+  }
+  size_t pos = obuf - buf;
+  result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+      pos, buf + pos, len - pos, latin1_output);
+  res.count += pos;
+  return res;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output);
+}
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16_result ret =
+      fast_avx512_convert_utf8_to_utf16(buf, len,
+                                        utf16_output);
+  if (ret.second == nullptr) {
+    return 0;
+  }
+  return ret.second - utf16_output;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16(
+      buf, len, utf16_output);
+  if (ret.second == nullptr) {
+    return 0;
+  }
+  return ret.second - utf16_output;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return fast_avx512_convert_utf8_to_utf16_with_errors(
+      buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return fast_avx512_convert_utf8_to_utf16_with_errors(
+      buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16_result ret =
+      icelake::valid_utf8_to_fixed_length(
+          buf, len, utf16_output);
+  size_t saved_bytes = ret.second - utf16_output;
+  const char *end = buf + len;
+  if (ret.first == end) {
+    return saved_bytes;
+  }
+
+  // Note: AVX512 procedure looks up 4 bytes forward, and
+  // correctly converts multi-byte chars even if their
+  // continuation bytes lie outside the 16-byte window.
+  // It means we have to skip continuation bytes from
+  // the beginning ret.first, as they were already consumed.
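+  // For intuition: given the 3-byte sequence 0xE2 0x82 0xAC (U+20AC), a
+  // kernel that stopped right after the leading byte 0xE2 leaves ret.first
+  // on 0x82; both 0x82 and 0xAC satisfy (b & 0xc0) == 0x80, so the loop
+  // below steps past them.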
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+    ret.first += 1;
+  }
+
+  if (ret.first != end) {
+    const size_t scalar_saved_bytes =
+        scalar::utf8_to_utf16::convert_valid(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16_result ret =
+      icelake::valid_utf8_to_fixed_length(
+          buf, len, utf16_output);
+  size_t saved_bytes = ret.second - utf16_output;
+  const char *end = buf + len;
+  if (ret.first == end) {
+    return saved_bytes;
+  }
+
+  // Note: AVX512 procedure looks up 4 bytes forward, and
+  // correctly converts multi-byte chars even if their
+  // continuation bytes lie outside the 16-byte window.
+  // It means we have to skip continuation bytes from
+  // the beginning ret.first, as they were already consumed.
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+    ret.first += 1;
+  }
+
+  if (ret.first != end) {
+    const size_t scalar_saved_bytes =
+        scalar::utf8_to_utf16::convert_valid(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+
+  return saved_bytes;
+}
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_out) const noexcept {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  utf8_to_utf32_result ret =
+      icelake::validating_utf8_to_fixed_length(
+          buf, len, utf32_output);
+  if (ret.second == nullptr)
+    return 0;
+
+  size_t saved_bytes = ret.second - utf32_output;
+  const char *end = buf + len;
+  if (ret.first == end) {
+    return saved_bytes;
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  // correctly converts multi-byte chars even if their
+  // continuation bytes lie outside the 16-byte window.
+  // It means we have to skip continuation bytes from
+  // the beginning ret.first, as they were already consumed.
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+    ret.first += 1;
+  }
+  if (ret.first != end) {
+    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
+        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+    const char *buf, size_t len, char32_t *utf32) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return {error_code::SUCCESS, 0};
+  }
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32);
+  auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<
+      endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+
+  if (!std::get<2>(ret)) {
+    size_t pos = std::get<0>(ret) - buf;
+    // We might have an error that occurs right before pos.
+    // This is only a concern if buf[pos] is not a continuation byte.
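+    // Reminder: continuation bytes have the form 0b10xxxxxx, so
+    // (b & 0xc0) == 0x80 holds exactly for bytes 0x80..0xbf; ASCII bytes are
+    // 0x00..0x7f and valid leading bytes are 0xc2..0xdf (2-byte),
+    // 0xe0..0xef (3-byte) and 0xf0..0xf4 (4-byte).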
+    if ((buf[pos] & 0xc0) != 0x80 && pos >= 64) {
+      pos -= 1;
+    } else if ((buf[pos] & 0xc0) == 0x80 && pos >= 64) {
+      // We must check whether we are the fourth continuation byte
+      bool c1 = (buf[pos - 1] & 0xc0) == 0x80;
+      bool c2 = (buf[pos - 2] & 0xc0) == 0x80;
+      bool c3 = (buf[pos - 3] & 0xc0) == 0x80;
+      if (c1 && c2 && c3) {
+        return {simdutf::TOO_LONG, pos};
+      }
+    }
+    // todo: we reset the output to utf32 instead of using std::get<1>(ret) as
+    // you'd expect; that is because
+    // validating_utf8_to_fixed_length_with_constant_checks may have processed
+    // data beyond the error.
+    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+        pos, buf + pos, len - pos, utf32);
+    res.count += pos;
+    return res;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  const char *end = buf + len;
+  if (std::get<0>(ret) == end) {
+    return {simdutf::SUCCESS, saved_bytes};
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  // correctly converts multi-byte chars even if their
+  // continuation bytes lie outside the 16-byte window.
+  // It means we have to skip continuation bytes from
+  // the beginning ret.first, as they were already consumed.
+  while (std::get<0>(ret) != end and
+         ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
+    std::get<0>(ret) += 1;
+  }
+
+  if (std::get<0>(ret) != end) {
+    auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
+        std::get<0>(ret), len - (std::get<0>(ret) - buf),
+        reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
+    if (scalar_result.error != simdutf::SUCCESS) {
+      scalar_result.count += (std::get<0>(ret) - buf);
+    } else {
+      scalar_result.count += saved_bytes;
+    }
+    return scalar_result;
+  }
+
+  return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)};
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_out) const noexcept {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  utf8_to_utf32_result ret =
+      icelake::valid_utf8_to_fixed_length(
+          buf, len, utf32_output);
+  size_t saved_bytes = ret.second - utf32_output;
+  const char *end = buf + len;
+  if (ret.first == end) {
+    return saved_bytes;
+  }
+
+  // Note: AVX512 procedure looks up 4 bytes forward, and
+  // correctly converts multi-byte chars even if their
+  // continuation bytes lie outside the 16-byte window.
+  // It means we have to skip continuation bytes from
+  // the beginning ret.first, as they were already consumed.
+ while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { + ret.first += 1; + } + + if (ret.first != end) { + const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid( + ret.first, len - (ret.first - buf), utf32_out + saved_bytes); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + + return saved_bytes; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1(buf, len, + latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1(buf, len, + latin1_output); +} + +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output) + .first; +} + +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output) + .first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement custom function + return convert_utf16be_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement custom function + return convert_utf16le_to_latin1(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i( + buf, len, (unsigned char *)utf8_output, &outlen); + if (inlen != len) { + return 0; + } + return outlen; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i( + buf, len, (unsigned char *)utf8_output, &outlen); + if (inlen != len) { + return 0; + } + return outlen; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i( + buf, len, (unsigned char *)utf8_output, &outlen); + if (inlen != len) { + result res = scalar::utf16_to_utf8::convert_with_errors( + buf + inlen, len - inlen, utf8_output + outlen); + res.count += inlen; + return res; + } + return {simdutf::SUCCESS, outlen}; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i( + buf, len, (unsigned char *)utf8_output, &outlen); + if (inlen != len) { + result res = scalar::utf16_to_utf8::convert_with_errors( + buf + inlen, len - inlen, 
utf8_output + outlen); + res.count += inlen; + return res; + } + return {simdutf::SUCCESS, outlen}; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf32_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf32_to_latin1_with_errors(buf, len, latin1_output) + .first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf32_to_latin1(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + avx512_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf32_to_utf8(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + avx512_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, 
char16_t *utf16_output) const noexcept {
+  std::pair ret =
+      avx512_convert_utf32_to_utf16(buf, len, utf16_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf16_output;
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair ret =
+      avx512_convert_utf32_to_utf16_with_errors(
+          buf, len, utf16_output);
+  if (ret.first.error) {
+    return ret.first;
+  }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair ret =
+      avx512_convert_utf32_to_utf16_with_errors(buf, len,
+                                                utf16_output);
+  if (ret.first.error) {
+    return ret.first;
+  }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+  return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return convert_utf32_to_utf16le(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return convert_utf32_to_utf16be(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::tuple ret =
+      icelake::convert_utf16_to_utf32(buf, len,
+                                      utf32_output);
+  if (!std::get<2>(ret)) {
+    return 0;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  if (std::get<0>(ret) != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::tuple ret =
+      icelake::convert_utf16_to_utf32(buf, len, utf32_output);
+  if (!std::get<2>(ret)) {
+    return 0;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  if (std::get<0>(ret) != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::tuple ret =
+      icelake::convert_utf16_to_utf32(buf, len,
+                                      utf32_output);
+  if (!std::get<2>(ret)) {
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    scalar_res.count += (std::get<0>(ret) - buf);
+    return scalar_res;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  if 
(std::get<0>(ret) != buf + len) { + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_res.error) { + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; + } else { + scalar_res.count += saved_bytes; + return scalar_res; + } + } + return simdutf::result(simdutf::SUCCESS, saved_bytes); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::tuple ret = + icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_res.error) { + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; + } else { + scalar_res.count += saved_bytes; + return scalar_res; + } + } + return simdutf::result(simdutf::SUCCESS, saved_bytes); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::tuple ret = + icelake::convert_utf16_to_utf32(buf, len, + utf32_output); + if (!std::get<2>(ret)) { + return 0; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::tuple ret = + icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { + return 0; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + size_t pos = 0; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + while (pos + 32 <= length) { + __m512i utf16 = _mm512_loadu_si512((const __m512i *)(input + pos)); + utf16 = _mm512_shuffle_epi8(utf16, byteflip); + _mm512_storeu_si512(output + pos, utf16); + pos += 32; + } + if (pos < length) { + __mmask32 m((1U << (length - pos)) - 1); + __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i *)(input + pos)); + utf16 = _mm512_shuffle_epi8(utf16, byteflip); + _mm512_mask_storeu_epi16(output + pos, m, utf16); + } +} + +simdutf_warn_unused size_t 
implementation::count_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const char16_t *ptr = input;
+  size_t count{0};
+
+  if (length >= 32) {
+    const char16_t *end = input + length - 32;
+
+    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+
+    while (ptr <= end) {
+      __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr);
+      ptr += 32;
+      uint64_t not_high_surrogate =
+          static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) |
+                                _mm512_cmplt_epu16_mask(utf16, low));
+      count += count_ones(not_high_surrogate);
+    }
+  }
+
+  return count + scalar::utf16::count_code_points(
+                     ptr, length - (ptr - input));
+}
+
+simdutf_warn_unused size_t implementation::count_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const char16_t *ptr = input;
+  size_t count{0};
+  if (length >= 32) {
+
+    const char16_t *end = input + length - 32;
+
+    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+
+    const __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+        0x0607040502030001, 0x0e0f0c0d0a0b0809);
+    while (ptr <= end) {
+      __m512i utf16 =
+          _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)ptr), byteflip);
+      ptr += 32;
+      uint64_t not_high_surrogate =
+          static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) |
+                                _mm512_cmplt_epu16_mask(utf16, low));
+      count += count_ones(not_high_surrogate);
+    }
+  }
+
+  return count + scalar::utf16::count_code_points(
+                     ptr, length - (ptr - input));
+}
+#endif // SIMDUTF_FEATURE_UTF16
+
+#if SIMDUTF_FEATURE_UTF8
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
+  size_t answer =
+      length / sizeof(__m512i) *
+      sizeof(__m512i); // Number of 512-bit chunks that fit into the length.
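+  // The count rests on a simple identity: code points = bytes minus
+  // continuation bytes, and in signed 8-bit arithmetic the continuation
+  // bytes 0x80..0xbf are exactly those <= (int8_t)0b10111111. Example:
+  // U+0061 U+20AC is 0x61 0xE2 0x82 0xAC, 4 bytes with 2 continuation
+  // bytes, hence 2 code points.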
+ size_t i = 0; + __m512i unrolled_popcount{0}; + + const __m512i continuation = _mm512_set1_epi8(char(0b10111111)); + + while (i + sizeof(__m512i) <= length) { + size_t iterations = (length - i) / sizeof(__m512i); + + size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); + for (; i + 8 * sizeof(__m512i) <= max_i; i += 8 * sizeof(__m512i)) { + __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); + __m512i input2 = + _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); + __m512i input3 = + _mm512_loadu_si512((const __m512i *)(str + i + 2 * sizeof(__m512i))); + __m512i input4 = + _mm512_loadu_si512((const __m512i *)(str + i + 3 * sizeof(__m512i))); + __m512i input5 = + _mm512_loadu_si512((const __m512i *)(str + i + 4 * sizeof(__m512i))); + __m512i input6 = + _mm512_loadu_si512((const __m512i *)(str + i + 5 * sizeof(__m512i))); + __m512i input7 = + _mm512_loadu_si512((const __m512i *)(str + i + 6 * sizeof(__m512i))); + __m512i input8 = + _mm512_loadu_si512((const __m512i *)(str + i + 7 * sizeof(__m512i))); + + __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation); + __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation); + __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation); + __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation); + __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation); + __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation); + __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation); + __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation); + + __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5, + mask4, mask3, mask2, mask1); + + unrolled_popcount = _mm512_add_epi64(unrolled_popcount, + _mm512_popcnt_epi64(mask_register)); + } + + for (; i <= max_i; i += sizeof(__m512i)) { + __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); + uint64_t continuation_bitmask = static_cast( + _mm512_cmple_epi8_mask(more_input, continuation)); + answer -= count_ones(continuation_bitmask); + } + } + + answer -= _mm512_reduce_add_epi64(unrolled_popcount); + + return answer + scalar::utf8::count_code_points( + reinterpret_cast(str + i), length - i); +} +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return icelake_utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return icelake_utf8_length_from_utf16(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return implementation::count_utf16le(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return implementation::count_utf16be(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 
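+// The computation below uses the fact that the UTF-8 length of Latin-1 text
+// is len plus the number of input bytes >= 0x80, each such byte expanding to
+// a 2-byte UTF-8 sequence. For example, the five Latin-1 bytes
+// 0x6E 0x61 0xEF 0x76 0x65 (U+00EF in the middle) need 5 + 1 = 6 UTF-8 bytes.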
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + const uint8_t *str = reinterpret_cast(input); + size_t answer = length / sizeof(__m512i) * sizeof(__m512i); + size_t i = 0; + if (answer >= 2048) // long strings optimization + { + unsigned char v_0xFF = 0xff; + __m512i eight_64bits = _mm512_setzero_si512(); + while (i + sizeof(__m512i) <= length) { + __m512i runner = _mm512_setzero_si512(); + size_t iterations = (length - i) / sizeof(__m512i); + if (iterations > 255) { + iterations = 255; + } + size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); + for (; i + 4 * sizeof(__m512i) <= max_i; i += 4 * sizeof(__m512i)) { + // Load four __m512i vectors + __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); + __m512i input2 = + _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); + __m512i input3 = _mm512_loadu_si512( + (const __m512i *)(str + i + 2 * sizeof(__m512i))); + __m512i input4 = _mm512_loadu_si512( + (const __m512i *)(str + i + 3 * sizeof(__m512i))); + + // Generate four masks + __mmask64 mask1 = + _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1); + __mmask64 mask2 = + _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2); + __mmask64 mask3 = + _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3); + __mmask64 mask4 = + _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4); + // Apply the masks and subtract from the runner + __m512i not_ascii1 = + _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF); + __m512i not_ascii2 = + _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF); + __m512i not_ascii3 = + _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF); + __m512i not_ascii4 = + _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF); + + runner = _mm512_sub_epi8(runner, not_ascii1); + runner = _mm512_sub_epi8(runner, not_ascii2); + runner = _mm512_sub_epi8(runner, not_ascii3); + runner = _mm512_sub_epi8(runner, not_ascii4); + } + + for (; i <= max_i; i += sizeof(__m512i)) { + __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); + + __mmask64 mask = + _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input); + __m512i not_ascii = + _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF); + runner = _mm512_sub_epi8(runner, not_ascii); + } + + eight_64bits = _mm512_add_epi64( + eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512())); + } + + answer += _mm512_reduce_add_epi64(eight_64bits); + } else if (answer > 0) { + for (; i + sizeof(__m512i) <= length; i += sizeof(__m512i)) { + __m512i latin = _mm512_loadu_si512((const __m512i *)(str + i)); + uint64_t non_ascii = _mm512_movepi8_mask(latin); + answer += count_ones(non_ascii); + } + } + return answer + scalar::latin1::utf8_length_from_latin1( + reinterpret_cast(str + i), length - i); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + size_t pos = 0; + + // UTF-16 char length based on the four most significant bits of UTF-8 bytes + const __m128i utf8_length_128 = _mm_setr_epi8( + // ASCII chars + /* 0000 */ 1, + /* 0001 */ 1, + /* 0010 */ 1, + /* 0011 */ 1, + /* 0100 */ 1, + /* 0101 */ 1, + /* 0110 */ 1, + /* 0111 */ 1, + + // continuation bytes + /* 1000 */ 0, + /* 1001 */ 0, + /* 1010 */ 0, + /* 1011 */ 0, + + // leading bytes + /* 1100 */ 1, // 2-byte UTF-8 char => 1 UTF-16 
word + /* 1101 */ 1, // 2-byte UTF-8 char => 1 UTF-16 word + /* 1110 */ 1, // 3-byte UTF-8 char => 1 UTF-16 word + /* 1111 */ 2 // 4-byte UTF-8 char => 2 UTF-16 words (surrogate pair) + ); + + const __m512i char_length = broadcast_128bit_lane(utf8_length_128); + + constexpr size_t max_iterations = 255 / 2; + + size_t iterations = 0; + const auto zero = _mm512_setzero_si512(); + __m512i local = _mm512_setzero_si512(); // byte-wise counters + __m512i counters = _mm512_setzero_si512(); // 64-bit counters + for (; pos + 64 <= length; pos += 64) { + __m512i utf8 = _mm512_loadu_si512((const __m512i *)(input + pos)); + const auto t0 = _mm512_srli_epi32(utf8, 4); + const auto t1 = _mm512_and_si512(t0, _mm512_set1_epi8(0xf)); + const auto t2 = _mm512_shuffle_epi8(char_length, t1); + local = _mm512_add_epi8(local, t2); + + iterations += 1; + if (iterations == max_iterations) { + counters = _mm512_add_epi64(counters, _mm512_sad_epu8(local, zero)); + local = zero; + iterations = 0; + } + } + + size_t count = 0; + + if (pos > 0) { + // don't waste time for short strings + if (iterations > 0) { + counters = _mm512_add_epi64(counters, _mm512_sad_epu8(local, zero)); + } + + const auto l0 = _mm512_extracti32x4_epi32(counters, 0); + const auto l1 = _mm512_extracti32x4_epi32(counters, 1); + const auto l2 = _mm512_extracti32x4_epi32(counters, 2); + const auto l3 = _mm512_extracti32x4_epi32(counters, 3); + + const auto sum = + _mm_add_epi64(_mm_add_epi64(l0, l1), _mm_add_epi64(l2, l3)); + + count = uint64_t(_mm_extract_epi64(sum, 0)) + + uint64_t(_mm_extract_epi64(sum, 1)); + } + + return count + + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos); +} +simdutf_warn_unused result +implementation::utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept { + return icelake_utf8_length_from_utf16_with_replacement( + input, length); +} + +simdutf_warn_unused result +implementation::utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept { + return icelake_utf8_length_from_utf16_with_replacement( + input, length); +} + +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return utf32::utf8_length_from_utf32(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const char32_t *ptr = input; + size_t count{0}; + + if (length >= 16) { + const char32_t *end = input + length - 16; + + const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); + + while (ptr <= end) { + __m512i utf32 = _mm512_loadu_si512((const __m512i *)ptr); + ptr += 16; + __mmask16 surrogates_bitmask = + _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); + + count += 16 + count_ones(surrogates_bitmask); + } + } + + return count + + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input)); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return implementation::count_utf8(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_BASE64 
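+// The entry points below dispatch on base64_options to select a decoder
+// variant per alphabet (standard, URL-safe, or either) and per
+// garbage-handling policy. As a sizing reminder, four base64 characters
+// decode to three bytes, so length / 4 * 3 output bytes always suffice.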
+simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == 
base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + if (options & base64_url) { + return encode_base64(output, input, length, options); + } else { + return encode_base64(output, input, length, options); + } +} + +size_t implementation::binary_to_base64_with_lines( + const char *input, size_t length, char *output, size_t line_length, + base64_options options) const noexcept { + if (options & base64_url) { + return encode_base64_impl(output, input, length, options, + line_length); + } else { + return encode_base64_impl(output, input, length, options, + line_length); + } +} + +const char *implementation::find(const char *start, const char *end, + char character) const noexcept { + return util_find(start, end, character); +} +const char16_t *implementation::find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept { + return util_find(start, end, character); +} +#endif // SIMDUTF_FEATURE_BASE64 + +} // namespace icelake +} // namespace simdutf + +/* begin file src/simdutf/icelake/end.h */ +#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE +// nothing needed. +#else +SIMDUTF_UNTARGET_REGION +#endif + + +#if SIMDUTF_GCC11ORMORE // workaround for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +SIMDUTF_POP_DISABLE_WARNINGS +#endif // end of workaround +/* end file src/simdutf/icelake/end.h */ +/* end file src/icelake/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_HASWELL +/* begin file src/haswell/implementation.cpp */ +/* begin file src/simdutf/haswell/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "haswell" +// #define SIMDUTF_IMPLEMENTATION haswell +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 + +#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL +// nothing needed. 
+#else +SIMDUTF_TARGET_HASWELL +#endif + +#if SIMDUTF_GCC11ORMORE // workaround for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +// clang-format off +SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) +// clang-format on +#endif // end of workaround +/* end file src/simdutf/haswell/begin.h */ + +namespace simdutf { +namespace haswell { +namespace { +#ifndef SIMDUTF_HASWELL_H + #error "haswell.h must be included" +#endif +using namespace simd; + +#if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || \ + SIMDUTF_FEATURE_UTF8 +simdutf_really_inline bool is_ascii(const simd8x64 &input) { + return input.reduce_or().is_ascii(); +} +#endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || + // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_really_inline simd8 +must_be_2_3_continuation(const simd8 prev2, + const simd8 prev3) { + simd8 is_third_byte = + prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be > 0x80 + simd8 is_fourth_byte = + prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be > 0x80 + return simd8(is_third_byte | is_fourth_byte); +} +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +namespace utf16 { +/* begin file src/haswell/avx2_validate_utf16.cpp */ +template +simd8 utf16_gather_high_bytes(const simd16 &in0, + const simd16 &in1) { + if (big_endian) { + // we want lower bytes + const auto mask = simd16(0x00ff); + const auto t0 = in0 & mask; + const auto t1 = in1 & mask; + + return simd16::pack(t0, t1); + } else { + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + return simd16::pack(t0, t1); + } +} +/* end file src/haswell/avx2_validate_utf16.cpp */ +} +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/haswell/avx2_utf16fix.cpp */ +/* + * Process one block of 16 characters. If in_place is false, + * copy the block from in to out. If there is a sequencing + * error in the block, overwrite the illsequenced characters + * with the replacement character. This function reads one + * character before the beginning of the buffer as a lookback. + * If that character is illsequenced, it too is overwritten. + */ +template +void utf16fix_block(char16_t *out, const char16_t *in) { + auto swap_if_needed = [](uint16_t x) simdutf_constexpr -> uint16_t { + return scalar::utf16::swap_if_needed(x); + }; + const char16_t replacement = scalar::utf16::replacement(); + __m256i lookback, block, lb_masked, block_masked, lb_is_high, block_is_low; + __m256i illseq, lb_illseq, block_illseq, lb_illseq_shifted; + + lookback = _mm256_loadu_si256((const __m256i *)(in - 1)); + block = _mm256_loadu_si256((const __m256i *)in); + lb_masked = + _mm256_and_si256(lookback, _mm256_set1_epi16(swap_if_needed(0xfc00u))); + block_masked = + _mm256_and_si256(block, _mm256_set1_epi16(swap_if_needed(0xfc00u))); + lb_is_high = + _mm256_cmpeq_epi16(lb_masked, _mm256_set1_epi16(swap_if_needed(0xd800u))); + block_is_low = _mm256_cmpeq_epi16(block_masked, + _mm256_set1_epi16(swap_if_needed(0xdc00u))); + + illseq = _mm256_xor_si256(lb_is_high, block_is_low); + if (!_mm256_testz_si256(illseq, illseq)) { + int lb; + + /* compute the cause of the illegal sequencing */ + lb_illseq = _mm256_andnot_si256(block_is_low, lb_is_high); +#if SIMDUTF_GCC9OROLDER + // Old GCC versions are missing _mm256_zextsi128_si256, so we emulate it. 
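+  // (_mm256_zextsi128_si256 zero-extends a 128-bit vector into the low lane
+  // of a 256-bit vector, leaving the high lane zero; the equivalent below
+  // builds that result with _mm256_set_m128i and a zeroed high half.)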
+ __m128i tmp_legacygcc = + _mm_bslli_si128(_mm256_extracti128_si256(lb_illseq, 1), 14); + __m256i tmp_legacygcc256 = + _mm256_set_m128i(_mm_setzero_si128(), tmp_legacygcc); + lb_illseq_shifted = + _mm256_or_si256(_mm256_bsrli_epi128(lb_illseq, 2), tmp_legacygcc256); +#else + lb_illseq_shifted = + _mm256_or_si256(_mm256_bsrli_epi128(lb_illseq, 2), + _mm256_zextsi128_si256(_mm_bslli_si128( + _mm256_extracti128_si256(lb_illseq, 1), 14))); +#endif // SIMDUTF_GCC9OROLDER + block_illseq = _mm256_or_si256( + _mm256_andnot_si256(lb_is_high, block_is_low), lb_illseq_shifted); + + /* fix illegal sequencing in the lookback */ +#if SIMDUTF_GCC10 || SIMDUTF_GCC9OROLDER + // GCC 10 is missing important intrinsics. + lb = _mm_cvtsi128_si32(_mm256_extractf128_si256(lb_illseq, 0)); +#else + lb = _mm256_cvtsi256_si32(lb_illseq); +#endif + lb = (lb & replacement) | (~lb & out[-1]); + out[-1] = char16_t(lb); + + /* fix illegal sequencing in the main block */ + block = + _mm256_blendv_epi8(block, _mm256_set1_epi16(replacement), block_illseq); + _mm256_storeu_si256((__m256i *)out, block); + } else if (!in_place) { + _mm256_storeu_si256((__m256i *)out, block); + } +} + +template +void utf16fix_block_sse(char16_t *out, const char16_t *in) { + auto swap_if_needed = [](uint16_t x) simdutf_constexpr -> uint16_t { + return scalar::utf16::swap_if_needed(x); + }; + const char16_t replacement = scalar::utf16::replacement(); + __m128i lookback, block, lb_masked, block_masked, lb_is_high, block_is_low; + __m128i illseq, lb_illseq, block_illseq; + + lookback = _mm_loadu_si128((const __m128i *)(in - 1)); + block = _mm_loadu_si128((const __m128i *)in); + lb_masked = _mm_and_si128(lookback, _mm_set1_epi16(swap_if_needed(0xfc00U))); + block_masked = _mm_and_si128(block, _mm_set1_epi16(swap_if_needed(0xfc00U))); + lb_is_high = + _mm_cmpeq_epi16(lb_masked, _mm_set1_epi16(swap_if_needed(0xd800U))); + block_is_low = + _mm_cmpeq_epi16(block_masked, _mm_set1_epi16(swap_if_needed(0xdc00U))); + + illseq = _mm_xor_si128(lb_is_high, block_is_low); + if (_mm_movemask_epi8(illseq) != 0) { + /* compute the cause of the illegal sequencing */ + lb_illseq = _mm_andnot_si128(block_is_low, lb_is_high); + block_illseq = _mm_or_si128(_mm_andnot_si128(lb_is_high, block_is_low), + _mm_bsrli_si128(lb_illseq, 2)); + /* fix illegal sequencing in the lookback */ + int lb = _mm_cvtsi128_si32(lb_illseq); + lb = (lb & replacement) | (~lb & out[-1]); + out[-1] = char16_t(lb); + /* fix illegal sequencing in the main block */ + block = + _mm_or_si128(_mm_andnot_si128(block_illseq, block), + _mm_and_si128(block_illseq, _mm_set1_epi16(replacement))); + _mm_storeu_si128((__m128i *)out, block); + } else if (!in_place) { + _mm_storeu_si128((__m128i *)out, block); + } +} + +template +void utf16fix_sse(const char16_t *in, size_t n, char16_t *out) { + const char16_t replacement = scalar::utf16::replacement(); + size_t i; + + if (n < 9) { + scalar::utf16::to_well_formed_utf16(in, n, out); + return; + } + + out[0] = + scalar::utf16::is_low_surrogate(in[0]) ? replacement : in[0]; + + /* duplicate code to have the compiler specialise utf16fix_block() */ + if (in == out) { + for (i = 1; i + 8 < n; i += 8) { + utf16fix_block_sse(out + i, in + i); + } + + utf16fix_block_sse(out + n - 8, in + n - 8); + } else { + for (i = 1; i + 8 < n; i += 8) { + utf16fix_block_sse(out + i, in + i); + } + + utf16fix_block_sse(out + n - 8, in + n - 8); + } + + out[n - 1] = scalar::utf16::is_high_surrogate(out[n - 1]) + ? 
replacement
+                   : out[n - 1];
+}
+
+template
+void utf16fix_avx(const char16_t *in, size_t n, char16_t *out) {
+  const char16_t replacement = scalar::utf16::replacement();
+  size_t i;
+
+  if (n < 17) {
+    utf16fix_sse(in, n, out);
+    return;
+  }
+
+  out[0] =
+      scalar::utf16::is_low_surrogate(in[0]) ? replacement : in[0];
+
+  /* duplicate code to have the compiler specialise utf16fix_block() */
+  if (in == out) {
+    for (i = 1; i + 16 < n; i += 16) {
+      utf16fix_block(out + i, in + i);
+    }
+
+    utf16fix_block(out + n - 16, in + n - 16);
+  } else {
+    for (i = 1; i + 16 < n; i += 16) {
+      utf16fix_block(out + i, in + i);
+    }
+
+    utf16fix_block(out + n - 16, in + n - 16);
+  }
+
+  out[n - 1] = scalar::utf16::is_high_surrogate(out[n - 1])
+                   ? replacement
+                   : out[n - 1];
+}
+/* end file src/haswell/avx2_utf16fix.cpp */
+#endif // SIMDUTF_FEATURE_UTF16
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+/* begin file src/haswell/avx2_convert_latin1_to_utf8.cpp */
+std::pair
+avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len,
+                            char *utf8_output) {
+  const char *end = latin1_input + len;
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+  const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+  const size_t safety_margin = 12;
+
+  while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) {
+    __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input);
+    // a single Latin-1 byte can yield 1 or 2 UTF-8 bytes
+    const __m128i v_80 = _mm_set1_epi8((char)0x80);
+    if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!!
+      // 1. store (16 bytes)
+      _mm_storeu_si128((__m128i *)utf8_output, in8);
+      // 2. adjust pointers
+      latin1_input += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+    }
+    // We proceed only with the first 16 bytes.
+    const __m256i in = _mm256_cvtepu8_epi16((in8));
+
+    // 1. prepare 2-byte values
+    // input 16-bit word : [0000|0000|aabb|bbbb] x 8
+    // expected output   : [1100|00aa|10bb|bbbb] x 8
+    const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+    const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+    // t0 = [0000|00aa|bbbb|bb00]
+    const __m256i t0 = _mm256_slli_epi16(in, 2);
+    // t1 = [0000|00aa|0000|0000]
+    const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+    // t2 = [0000|0000|00bb|bbbb]
+    const __m256i t2 = _mm256_and_si256(in, v_003f);
+    // t3 = [0000|00aa|00bb|bbbb]
+    const __m256i t3 = _mm256_or_si256(t1, t2);
+    // t4 = [1100|00aa|10bb|bbbb]
+    const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+    // 2. merge ASCII and 2-byte codewords
+
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+    const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+    // 3. prepare bitmask for 8-bit lookup
+    const uint32_t M0 = one_byte_bitmask & 0x55555555;
+    const uint32_t M1 = M0 >> 7;
+    const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+    // 4. 
pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)] + [0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + latin1_input += 16; + continue; + + } // while + return std::make_pair(latin1_input, utf8_output); +} +/* end file src/haswell/avx2_convert_latin1_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/haswell/avx2_convert_latin1_to_utf16.cpp */ +template +std::pair +avx2_convert_latin1_to_utf16(const char *latin1_input, size_t len, + char16_t *utf16_output) { + size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 + + size_t i = 0; + for (; i < rounded_len; i += 16) { + // Load 16 bytes from the address (input + i) into a xmm register + const __m128i latin1 = + _mm_loadu_si128(reinterpret_cast(latin1_input + i)); + + // Zero extend each byte in `in` to word + __m256i utf16 = _mm256_cvtepu8_epi16(latin1); + + if (big_endian) { + const __m128i swap128 = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m256i swap = _mm256_set_m128i(swap128, swap128); + utf16 = _mm256_shuffle_epi8(utf16, swap); + } + + // Store the contents of xmm1 into the address pointed by (output + i) + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output + i), utf16); + } + + return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); +} +/* end file src/haswell/avx2_convert_latin1_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */ +std::pair +avx2_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8 + + for (size_t i = 0; i < rounded_len; i += 8) { + // Load 8 Latin1 characters into a 64-bit register + __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]); + + // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using + // vpmovzxbd + __m256i out = _mm256_cvtepu8_epi32(in); + + // Store the results back to memory + _mm256_storeu_si256((__m256i *)&utf32_output[i], out); + } + + // return pointers pointing to where we left off + return std::make_pair(buf + rounded_len, utf32_output + rounded_len); +} +/* end file src/haswell/avx2_convert_latin1_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). 
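+// Mask convention, by example: for the input bytes 0x61 0xC2 0xA2 0xE2 0x82
+// 0xAC (U+0061, U+00A2, U+20AC), the end-of-code-point mask has bits 0, 2
+// and 5 set: one bit at the final byte of each code point.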
+template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + __m256i ascii = _mm256_cvtepu8_epi16(in); + if (big_endian) { + const __m256i swap256 = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + ascii = _mm256_shuffle_epi8(ascii, swap256); + } + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii); + utf16_output += 12; // We wrote 12 16-bit characters. + return 12; // We consumed 12 bytes. + } + if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte + // UTF-16 code units. There is probably a more efficient sequence, but the + // following might do. + const __m128i sh = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) + composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. There is probably a more efficient sequence, but the + // following might do. 
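+    // Scalar equivalent of the shuffle-and-mask sequence below: for a 3-byte
+    // sequence b0 b1 b2, the code point is
+    //   ((b0 & 0x0f) << 12) | ((b1 & 0x3f) << 6) | (b2 & 0x3f),
+    // e.g. 0xE2 0x82 0xAC -> 0x2000 | 0x80 | 0x2c = 0x20ac.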
+    const __m128i sh =
+        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+    if (big_endian)
+      composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+    utf16_output += 4;
+    return 12;
+  }
+
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
+  if (idx < 64) {
+    // SIX (6) input code units
+    // this is a relatively easy scenario
+    // we process SIX (6) input code units. The max length in bytes of six
+    // code units spanning between 1 and 2 bytes each is 12 bytes. On
+    // processors where pdep/pext is fast, we might be able to use a small
+    // lookup table.
+    const __m128i sh = _mm_loadu_si128(
+        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    if (big_endian)
+      composed = _mm_shuffle_epi8(composed, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed);
+    utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential
+                       // overflow of 4 bytes.
+  } else if (idx < 145) {
+    // FOUR (4) input code units
+    const __m128i sh = _mm_loadu_si128(
+        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+    if (big_endian)
+      composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+    utf16_output += 4; // Here we overflow by 8 bytes.
+  } else if (idx < 209) {
+    // TWO (2) input code units
+    //////////////
+    // There might be garbage inputs where a leading byte masquerades as a
+    // four-byte leading byte (by being followed by 3 continuation bytes), but
+    // is not greater than 0xf0. This could trigger a buffer overflow if we
+    // only counted leading bytes of the form 0xf0 as generating surrogate
+    // pairs, without further UTF-8 validation. Thus we must be careful to
+    // ensure that only leading bytes at least as large as 0xf0 generate
+    // surrogate pairs.
+    // We do so at the cost of an extra mask.
+    /////////////
+    const __m128i sh = _mm_loadu_si128(
+        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+    // correct for spurious high bit
+    const __m128i correct =
+        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+    // We deliberately carry the leading four bits in highbyte if they are
+    // present; we remove them later when computing hightenbits.
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+    // When we need to generate a surrogate pair (leading byte > 0xF0), then
+    // the corresponding 32-bit value in 'composed' will be greater than
+    // (0xf0000000 >> 6) = 0x3c00000. This can be used later to identify the
+    // location of the surrogate pairs.
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+    const __m128i composedminus =
+        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+    const __m128i lowtenbits =
+        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+    // Notice the 0x3ff mask:
+    const __m128i hightenbits =
+        _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+    const __m128i lowtenbitsadd =
+        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+    const __m128i hightenbitsadd =
+        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+    __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
+    uint32_t basic_buffer[4];
+    uint32_t basic_buffer_swap[4];
+    if (big_endian) {
+      _mm_storeu_si128((__m128i *)basic_buffer_swap,
+                       _mm_shuffle_epi8(composed, swap));
+      surrogates = _mm_shuffle_epi8(surrogates, swap);
+    }
+    _mm_storeu_si128((__m128i *)basic_buffer, composed);
+    uint32_t surrogate_buffer[4];
+    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
+    for (size_t i = 0; i < 3; i++) {
+      if (basic_buffer[i] > 0x3c00000) {
+        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+        utf16_output += 2;
+      } else {
+        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i])
+                                     : uint16_t(basic_buffer[i]);
+        utf16_output++;
+      }
+    }
+  } else {
+    // here we know that there is an error but we do not handle errors
+  }
+  return consumed;
+}
+/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+/* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 12).
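+//
+// Worked example for the 2-byte fast path below: given the UTF-8 bytes
+// C3 A9 (U+00E9), the byte-swapping shuffle places 0xC3A9 in a 16-bit
+// lane; (0xC3A9 & 0x7f) = 0x29 keeps the continuation payload,
+// (0xC3A9 & 0x1f00) >> 2 = 0xC0 recovers the high bits, and their OR is
+// the code point 0x00E9. (Illustrative note.)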
+size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), + _mm256_cvtepu8_epi32(in)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), + _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8))); + utf32_output += 12; // We wrote 12 32-bit characters. + return 12; // We consumed 12 bytes. + } + if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte + // UTF-32 code units. There is probably a more efficient sequence, but the + // following might do. + const __m128i sh = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm256_storeu_si256((__m256i *)utf32_output, + _mm256_cvtepu16_epi32(composed)); + utf32_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte + // UTF-32 code units. There is probably a more efficient sequence, but the + // following might do. + const __m128i sh = + _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small + // lookup table. 
+ const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm256_storeu_si256((__m256i *)utf32_output, + _mm256_cvtepu16_epi32(composed)); + utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential + // overflow of 32 - 24 = 8 bytes. + } else if (idx < 145) { + // FOUR (4) input code-code units + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + } else if (idx < 209) { + // TWO (2) input code-code units + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += + 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes. 
+ } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/haswell/avx2_convert_utf16_to_latin1.cpp */ +template +std::pair +avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + while (end - buf >= 32) { + // Load 16 UTF-16 characters into 256-bit AVX2 register + __m256i in0 = _mm256_loadu_si256(reinterpret_cast(buf)); + __m256i in1 = + _mm256_loadu_si256(reinterpret_cast(buf + 16)); + + if simdutf_constexpr (!match_system(big_endian)) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in0 = _mm256_shuffle_epi8(in0, swap); + in1 = _mm256_shuffle_epi8(in1, swap); + } + + __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); + if (_mm256_testz_si256(_mm256_or_si256(in0, in1), high_byte_mask)) { + // Pack 16-bit characters into 8-bit and store in latin1_output + const __m256i packed = _mm256_packus_epi16(in0, in1); + + const __m256i result = _mm256_permute4x64_epi64(packed, 0b11011000); + + _mm256_storeu_si256(reinterpret_cast<__m256i *>(latin1_output), result); + // Adjust pointers for the next iteration + buf += 32; + latin1_output += 32; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} + +template +std::pair +avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + while (end - buf >= 16) { + __m256i in = _mm256_loadu_si256(reinterpret_cast(buf)); + + if simdutf_constexpr (!match_system(big_endian)) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); + if (_mm256_testz_si256(in, high_byte_mask)) { + __m128i lo = _mm256_extractf128_si256(in, 0); + __m128i hi = _mm256_extractf128_si256(in, 1); + __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); + __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), + latin1_packed_lo); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), + latin1_packed_hi); + buf += 16; + latin1_output += 16; + } else { + // Fallback to scalar code for handling errors + for (int k = 0; k < 16; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair( + result{error_code::TOO_LARGE, (size_t)(buf - start + k)}, + latin1_output); + } + } + buf += 16; + } + } // while + return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)}, + latin1_output); +} +/* end file src/haswell/avx2_convert_utf16_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. + + We consider three cases: + 1. 
+     an input register contains no surrogates and each value
+     is in range 0x0000 .. 0x07ff.
+  2. an input register contains no surrogates and values are
+     in range 0x0000 .. 0xffff.
+  3. an input register contains surrogates --- i.e. codepoints
+     can have 16 or 32 bits.
+
+  Ad 1.
+
+  When values are less than 0x0800, it means that a 16-bit code unit
+  can be converted into: 1) a single UTF8 byte (when it is an ASCII
+  char) or 2) two UTF8 bytes.
+
+  For this case we do only some shuffle to obtain these 2-byte
+  codes and finally compress the whole SSE register with a single
+  shuffle.
+
+  We need a 256-entry lookup table to get a compression pattern
+  and the number of output bytes in the compressed vector register.
+  Each entry occupies 17 bytes.
+
+  Ad 2.
+
+  When values fit in 16-bit code units, but are above 0x07ff, then
+  a single word may produce one, two or three UTF8 bytes.
+
+  We prepare data for all these three cases in two registers.
+  The first register contains the lower two UTF8 bytes (used in all
+  cases), while the second one contains just the third byte for
+  the three-UTF8-bytes case.
+
+  Finally these two registers are interleaved, forming an eight-element
+  array of 32-bit values. The array spans two SSE registers.
+  The bytes from the registers are compressed using two shuffles.
+
+  We need a 256-entry lookup table to get a compression pattern
+  and the number of output bytes in the compressed vector register.
+  Each entry occupies 17 bytes.
+
+
+  To summarize:
+  - We need two 256-entry tables that have 8704 bytes in total.
+*/
+
+/*
+  Returns a pair: the first unprocessed byte from buf and utf8_output.
+  A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
+  const char16_t *end = buf + len;
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    if (big_endian) {
+      const __m256i swap = _mm256_setr_epi8(
+          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
+    }
+    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+    if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(
+          _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+ } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. 
Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
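+      // Worked example for the scalar loop below: the surrogate pair
+      // D83D DE00 gives diff = 0x3D and diff2 = 0x200, so
+      // value = (0x3D << 10) + 0x200 + 0x10000 = 0x1F600 (U+1F600),
+      // written out as the four UTF-8 bytes F0 9F 98 80.
+      // (Illustrative note.)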
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, utf8_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + return std::make_pair(buf, utf8_output); +} + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ +template +std::pair +avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); + if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
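+
+     Worked example: for U+20AC the input word is [0010|0000|1010|1100];
+     case #3 applies and the three output bytes are [1110|0010] = 0xE2,
+     [10|000010] = 0x82 and [10|101100] = 0xAC, i.e. the UTF-8 encoding
+     E2 82 AC. (Illustrative note.)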
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), 
shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + utf8_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. 
+ The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + +/* + Returns a pair: the first unprocessed byte from buf and utf32_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair +avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_output) { + const char16_t *end = buf + len; + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + + while (end - buf >= 16) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: we extend all sixteen 16-bit code units to sixteen 32-bit code + // units + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(utf32_output + 8), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1))); + utf32_output += 16; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xF800) != 0xD800) { + // No surrogate pair + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, utf32_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(buf, utf32_output); +} + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. 
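+
+  For instance, a lone high surrogate at word index 7, not followed by a
+  low surrogate, yields a result with error_code::SURROGATE and count 7:
+  the count is the index of the offending code unit. (Illustrative note.)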
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+                                        char32_t *utf32_output) {
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+  while (end - buf >= 16) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    if (big_endian) {
+      const __m256i swap = _mm256_setr_epi8(
+          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
+    }
+
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We must also deal with the situation where a surrogate word appears
+    //    at the end of a chunk.
+    const __m256i surrogates_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+    // bitmask = 0x00000000 if there are no surrogates
+    //         = 0xc0000000 if the last word is a surrogate
+    const uint32_t surrogates_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc0000000 could
+    // help. However, it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x00000000) {
+      // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
+      // units
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
+                          _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+      _mm256_storeu_si256(
+          reinterpret_cast<__m256i *>(utf32_output + 8),
+          _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+      utf32_output += 16;
+      buf += 16;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
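+      // Worked example for the scalar loop below: the pair DBFF DFFF gives
+      // diff = 0x3FF and diff2 = 0x3FF, so value = (0x3FF << 10) + 0x3FF +
+      // 0x10000 = 0x10FFFF, the largest valid code point.
+      // (Illustrative note.)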
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xF800) != 0xD800) { + // No surrogate pair + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + utf32_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); +} +/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/haswell/avx2_convert_utf32_to_latin1.cpp */ +std::pair +avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = + len & ~0x1F; // Round down to nearest multiple of 32 + + const __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); + + for (size_t i = 0; i < rounded_len; i += 4 * 8) { + __m256i a = _mm256_loadu_si256((__m256i *)(buf + 0 * 8)); + __m256i b = _mm256_loadu_si256((__m256i *)(buf + 1 * 8)); + __m256i c = _mm256_loadu_si256((__m256i *)(buf + 2 * 8)); + __m256i d = _mm256_loadu_si256((__m256i *)(buf + 3 * 8)); + + const __m256i check_combined = + _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d)); + + if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { + return std::make_pair(nullptr, latin1_output); + } + + b = _mm256_slli_epi32(b, 1 * 8); + c = _mm256_slli_epi32(c, 2 * 8); + d = _mm256_slli_epi32(d, 3 * 8); + + // clang-format off + + // a = [.. .. .. a7|.. .. .. a6|.. .. .. a5|.. .. .. a4||.. .. .. a3|.. .. .. a2|.. .. .. a1|.. .. .. a0] + // b = [.. .. b7 ..|.. .. b6 ..|.. .. b5 ..|.. .. b4 ..||.. .. b3 ..|.. .. b2 ..|.. .. b1 ..|.. .. b0 ..] + // c = [.. c7 .. ..|.. c6 .. ..|.. c5 .. ..|.. c4 .. ..||.. c3 .. ..|.. c2 .. ..|.. c1 .. ..|.. c0 .. ..] + // d = [d7 .. .. ..|d6 .. .. ..|d5 .. .. ..|d4 .. .. ..||d3 .. .. ..|d2 .. .. ..|d1 .. .. ..|d0 .. .. ..] 
+ + // t0 = [d7 c7 b7 a7|d6 c6 b6 a6|d5 c5 b5 a5|d4 c4 b4 a4||d3 c3 b3 a3|d2 c2 b2 a2|d1 c1 b1 a1|d0 c0 b0 a0] + const __m256i t0 = + _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d)); + + // shuffle bytes within 128-bit lanes + // t1 = [d7 d6 d5 d4|c7 c6 c5 c4|b7 b6 b5 b4|a7 a6 a5 a4||d3 d2 d1 d0|c3 c2 c1 c0|b3 b2 b1 b0|a3 a2 a1 a0] + const __m256i shuffle_bytes = + _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + + const __m256i t1 = _mm256_shuffle_epi8(t0, shuffle_bytes); + + // reshuffle dwords + // t2 = [d7 d6 d5 d4|d3 d2 d1 d0|c7 c6 c5 c4|c3 c2 c1 c0||b7 b6 b5 b4|b3 b2 b1 b0|a7 a6 a5 a4|a3 a2 a1 a0] + const __m256i shuffle_dwords = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + const __m256i t2 = _mm256_permutevar8x32_epi32(t1, shuffle_dwords); +// clang format on + + _mm256_storeu_si256((__m256i *)latin1_output, t2); + + latin1_output += 32; + buf += 32; + } + + return std::make_pair(buf, latin1_output); +} + +std::pair +avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = + len & ~0x1F; // Round down to nearest multiple of 32 + + const char32_t *start = buf; + + const __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); + + for (size_t i = 0; i < rounded_len; i += 4 * 8) { + __m256i a = _mm256_loadu_si256((__m256i *)(buf + 0 * 8)); + __m256i b = _mm256_loadu_si256((__m256i *)(buf + 1 * 8)); + __m256i c = _mm256_loadu_si256((__m256i *)(buf + 2 * 8)); + __m256i d = _mm256_loadu_si256((__m256i *)(buf + 3 * 8)); + + const __m256i check_combined = + _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d)); + + if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { + // Fallback to scalar code for handling errors + for (int k = 0; k < 4 * 8; k++) { + char32_t codepoint = buf[k]; + if (codepoint <= 0xFF) { + *latin1_output++ = static_cast(codepoint); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + } + + b = _mm256_slli_epi32(b, 1 * 8); + c = _mm256_slli_epi32(c, 2 * 8); + d = _mm256_slli_epi32(d, 3 * 8); + + const __m256i t0 = + _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d)); + + const __m256i shuffle_bytes = + _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + + const __m256i t1 = _mm256_shuffle_epi8(t0, shuffle_bytes); + + const __m256i shuffle_dwords = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + const __m256i t2 = _mm256_permutevar8x32_epi32(t1, shuffle_dwords); + + _mm256_storeu_si256((__m256i *)latin1_output, t2); + + latin1_output += 32; + buf += 32; + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} +/* end file src/haswell/avx2_convert_utf32_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */ +std::pair +avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { + const char32_t *end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = 
_mm256_set1_epi32((uint32_t)0x7fffffff); + __m256i running_max = _mm256_setzero_si256(); + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); + + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), + _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits + // (haswell/avx2_convert_utf16_to_utf8.cpp) + + if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. 
adjust pointers + buf += 16; + continue; + } + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = + static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm256_or_si256( + forbidden_bytemask, + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); + + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ } else {
+ // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+ // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+ // wasteful to use scalar code, but being efficient with SIMD may require
+ // large, non-trivial tables?
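This fallback branch is entered when the saturation test above fails, that is, when at least one lane needs more than 16 bits. A minimal scalar sketch of that test, under the illustrative helper name fits_in_16_bits (not a simdutf symbol):

#include <cstddef>
#include <cstdint>

// Mirrors _mm256_or_si256(in, nextin) followed by the v_ffff0000
// comparison: _mm256_packus_epi32 saturates any lane above 0xFFFF, so the
// vector path is only safe when every lane's high 16 bits are zero.
bool fits_in_16_bits(const uint32_t *lanes, size_t n) {
  uint32_t acc = 0;
  for (size_t i = 0; i < n; i++) {
    acc |= lanes[i];
  }
  return (acc & 0xFFFF0000u) == 0;
}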
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { // 2-byte + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + // check for invalid input + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + if (static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32( + _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { + return std::make_pair(nullptr, utf8_output); + } + + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf8_output); + } + + return std::make_pair(buf, utf8_output); +} + +std::pair +avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_output) { + const char32_t *end = buf + len; + const char32_t *start = buf; + + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + // Check for too large input + const __m256i max_input = + _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); + if (static_cast(_mm256_movemask_epi8( + _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + utf8_output); + } + + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), + _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits + // (haswell/avx2_convert_utf16_to_utf8.cpp) + + if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! 
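The ASCII fast path above rests on a single _mm256_testz_si256 call: when no code unit has bits at or above 0x80, sixteen values can be narrowed with one pack and one store. A self-contained sketch of the same test applied to raw UTF-32 lanes, with the illustrative name all_ascii8 (requires AVX2, e.g. compile with -mavx2):

#include <immintrin.h>
#include <cstdint>

// _mm256_testz_si256(v, mask) returns 1 exactly when (v & mask) is
// all-zero -- the same idiom as the v_ff80 test in the kernel above.
bool all_ascii8(const uint32_t *p) {
  const __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(p));
  const __m256i mask = _mm256_set1_epi32(static_cast<int>(0xFFFFFF80));
  return _mm256_testz_si256(v, mask) == 1;
}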
+ } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = + static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + + // Check for illegal surrogate code units + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + const __m256i forbidden_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != + 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + utf8_output); + } + + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ } else {
+ // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+ // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+ // wasteful to use scalar code, but being efficient with SIMD may require
+ // large, non-trivial tables?
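The four 8-bit slices of `mask` taken above index pack_1_2_3_utf8_bytes; the code itself shows the row layout: row[0] holds the number of output bytes and row + 1 a 16-byte shuffle. Each 16-bit code unit contributes one bit pair to the index. A scalar model of that index, under the illustrative name row_index:

#include <cstdint>

// Bit 2k is set when word k is ASCII (one UTF-8 byte) and bit 2k+1 when it
// fits in two UTF-8 bytes, matching the movemask combination
// (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa).
uint8_t row_index(const uint16_t w[4]) {
  uint8_t m = 0;
  for (int k = 0; k < 4; k++) {
    if (w[k] < 0x0080) { m |= uint8_t(1u << (2 * k)); }
    if (w[k] < 0x0800) { m |= uint8_t(1u << (2 * k + 1)); }
  }
  return m; // 256 possible rows, one per combination
}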
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { // 2-byte + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */ +template +std::pair +avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *end = buf + len; + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + + while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { + const __m256i in = _mm256_loadu_si256((__m256i *)buf); + + if (simdutf_likely(_mm256_testz_si256(in, v_ffff0000))) { + // no bits set above 16th bit <=> can pack to UTF16 + // without surrogate pairs + forbidden_bytemask = _mm256_or_si256( + forbidden_bytemask, + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); + + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), + _mm256_extractf128_si256(in, 1)); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, utf16_output); + } + *utf16_output++ = + big_endian + ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(nullptr, utf16_output); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = + uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = + uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + // check for invalid input + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf16_output); + } + + return std::make_pair(buf, utf16_output); +} + +template +std::pair +avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + + while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { + const __m256i in = _mm256_loadu_si256((__m256i *)buf); + + if (simdutf_likely(_mm256_testz_si256(in, v_ffff0000))) { + // no bits set above 16th bit <=> can pack to UTF16 without surrogate + // pairs + const __m256i forbidden_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != + 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + utf16_output); + } + + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), + _mm256_extractf128_si256(in, 1)); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf16_output); + } + *utf16_output++ = + big_endian + ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf16_output); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = + uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = + uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); +} +/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/haswell/avx2_convert_utf8_to_latin1.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + +// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + const __m128i in = _mm_loadu_si128((__m128i *)input); + + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & + 0xfff; // we are only processing 12 bytes in case it is not all ASCII + + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); + latin1_output += 12; // We wrote 12 characters. + return 12; // We consumed 1 bytes. + } + /// We do not have a fast path available, so we fallback. + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if (idx >= 64) { + return consumed; + } + // Here we should have (idx < 64), if not, there is a bug in the validation or + // elsewhere. SIX (6) input code-code units this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small lookup + // table. 
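Before the shuffle-based body below, the whole transformation is easy to state in scalar form: a Latin1 code point is either one UTF-8 byte or a two-byte sequence whose lead byte is 0xC2 or 0xC3. A scalar reference, assuming already-validated input and the illustrative name utf8_to_latin1_scalar:

#include <cstddef>
#include <cstdint>

// Assumes valid UTF-8 that only encodes code points <= 0xFF.
size_t utf8_to_latin1_scalar(const uint8_t *in, size_t len, uint8_t *out) {
  size_t written = 0;
  for (size_t i = 0; i < len;) {
    if (in[i] < 0x80) { // one byte: 0xxxxxxx
      out[written++] = in[i];
      i += 1;
    } else { // two bytes: 1100001x 10yyyyyy encodes the value 1xyyyyyy
      out[written++] = uint8_t(((in[i] & 0x1F) << 6) | (in[i + 1] & 0x3F));
      i += 2;
    }
  }
  return written;
}

This is the same bit surgery the SIMD path performs with one shuffle, two masks, and a pack.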
+ const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + const __m128i latin1_packed = _mm_packus_epi16(composed, composed); + // writing 8 bytes even though we only care about the first 6 bytes. + // performance note: it would be faster to use _mm_storeu_si128, we should + // investigate. + _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); + latin1_output += 6; // We wrote 6 bytes. + return consumed; +} +/* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_BASE64 +/* begin file src/haswell/avx2_base64.cpp */ +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). 
+ */ + +template +simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) { + // Precomputed shuffle masks for K = 1 to 16 + // credit: Wojciech Muła + __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); + const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); + result = + _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); + __m256i shift_LUT; + if (base64_url) { + shift_LUT = _mm256_setr_epi8( + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0, + + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); + } else { + shift_LUT = _mm256_setr_epi8( + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0, + + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); + } + + result = _mm256_shuffle_epi8(shift_LUT, result); + return _mm256_add_epi8(result, input); +} + +simdutf_really_inline __m256i insert_line_feed32(__m256i input, int K) { + + static const uint8_t low_table[16][32] = { + {0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}; + static const uint8_t high_table[16][32] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80}}; + + __m256i line_feed_vector = _mm256_set1_epi8('\n'); + if (K >= 16) { + __m256i mask = _mm256_loadu_si256((const __m256i *)high_table[K - 16]); + __m256i lf_pos = + _mm256_cmpeq_epi8(mask, _mm256_set1_epi8(static_cast(0x80))); + __m256i shuffled = _mm256_shuffle_epi8(input, mask); + __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); + return result; + } + // Shift input right by 1 byte + __m256i shift = _mm256_alignr_epi8( + input, _mm256_permute2x128_si256(input, input, 0x21), 15); + + input = _mm256_blend_epi32(input, shift, 0xF0); + + __m256i mask = _mm256_loadu_si256((const __m256i *)low_table[K]); + + __m256i lf_pos = + _mm256_cmpeq_epi8(mask, _mm256_set1_epi8(static_cast(0x80))); + __m256i shuffled = _mm256_shuffle_epi8(input, mask); + + __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); + return result; +} + +template +size_t +avx2_encode_base64_impl(char *dst, const char *src, size_t srclen, + base64_options options, + size_t line_length = simdutf::default_line_length) { + size_t offset = 0; + + if (line_length < 4) { + line_length = 4; // We do not support line_length less than 4 + } + // credit: Wojciech Muła + const uint8_t *input = (const uint8_t *)src; + + uint8_t *out = (uint8_t *)dst; + const __m256i shuf = + _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, + + 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); + size_t i = 0; + for (; i + 100 <= srclen; i += 96) { + const __m128i lo0 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 0)); + const __m128i hi0 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 1)); + const __m128i lo1 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 2)); + const __m128i hi1 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 3)); + const __m128i lo2 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 4)); + const __m128i hi2 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 5)); + const __m128i lo3 = _mm_loadu_si128( + reinterpret_cast(input + 
i + 4 * 3 * 6)); + const __m128i hi3 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 7)); + + __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); + __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); + __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); + __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); + + const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); + + const __m256i t1_0 = + _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); + const __m256i t1_1 = + _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); + const __m256i t1_2 = + _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); + const __m256i t1_3 = + _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); + + const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); + + const __m256i t3_0 = + _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); + const __m256i t3_1 = + _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); + const __m256i t3_2 = + _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); + const __m256i t3_3 = + _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); + + const __m256i input0 = _mm256_or_si256(t1_0, t3_0); + const __m256i input1 = _mm256_or_si256(t1_1, t3_1); + const __m256i input2 = _mm256_or_si256(t1_2, t3_2); + const __m256i input3 = _mm256_or_si256(t1_3, t3_3); + + if (use_lines) { + if (line_length >= 32) { // fast path + __m256i result; + result = lookup_pshufb_improved(input0); + if (offset + 32 > line_length) { + size_t location_end = line_length - offset; + size_t to_move = 32 - location_end; + // We could do this, or extract instead. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(out), + insert_line_feed32(result, static_cast(location_end))); + offset = to_move; + out += 32 + 1; + } else { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); + offset += 32; + out += 32; + } + result = lookup_pshufb_improved(input1); + + if (offset + 32 > line_length) { + size_t location_end = line_length - offset; + size_t to_move = 32 - location_end; + + // We could do this, or extract instead. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(out), + insert_line_feed32(result, static_cast(location_end))); + // see above. + // out[32] = static_cast(_mm256_extract_epi8(result, 31)); + offset = to_move; + out += 32 + 1; + } else { + + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); + + offset += 32; + out += 32; + } + result = lookup_pshufb_improved(input2); + + if (offset + 32 > line_length) { + size_t location_end = line_length - offset; + size_t to_move = 32 - location_end; + + // We could do this, or extract instead. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(out), + insert_line_feed32(result, static_cast(location_end))); + // see above. 
+ // out[32] = static_cast(_mm256_extract_epi8(result, 31)); + offset = to_move; + out += 32 + 1; + } else { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); + offset += 32; + out += 32; + } + result = lookup_pshufb_improved(input3); + + if (offset + 32 > line_length) { + size_t location_end = line_length - offset; + size_t to_move = 32 - location_end; + + // We could do this, or extract instead. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(out), + insert_line_feed32(result, static_cast(location_end))); + // see above. + // out[32] = static_cast(_mm256_extract_epi8(result, 31)); + offset = to_move; + out += 32 + 1; + } else { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); + offset += 32; + out += 32; + } + } else { // slow path + // could be optimized + uint8_t buffer[128]; + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), + lookup_pshufb_improved(input0)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32), + lookup_pshufb_improved(input1)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64), + lookup_pshufb_improved(input2)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96), + lookup_pshufb_improved(input3)); + size_t out_pos = 0; + size_t local_offset = offset; + for (size_t j = 0; j < 128;) { + if (local_offset == line_length) { + out[out_pos++] = '\n'; + local_offset = 0; + } + out[out_pos++] = buffer[j++]; + local_offset++; + } + offset = local_offset; + out += out_pos; + } + } else { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved(input0)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32), + lookup_pshufb_improved(input1)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64), + lookup_pshufb_improved(input2)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96), + lookup_pshufb_improved(input3)); + + out += 128; + } + } + for (; i + 28 <= srclen; i += 24) { + // lo = [xxxx|DDDC|CCBB|BAAA] + // hi = [xxxx|HHHG|GGFF|FEEE] + const __m128i lo = + _mm_loadu_si128(reinterpret_cast(input + i)); + const __m128i hi = + _mm_loadu_si128(reinterpret_cast(input + i + 4 * 3)); + + // bytes from groups A, B and C are needed in separate 32-bit lanes + // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] + __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); + + // this part is well commented in encode.sse.cpp + + const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); + const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); + const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); + const __m256i indices = _mm256_or_si256(t1, t3); + + if (use_lines) { + if (line_length >= 32) { // fast path + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved(indices)); + + if (offset + 32 > line_length) { + size_t location_end = line_length - offset; + size_t to_move = 32 - location_end; + std::memmove(out + location_end + 1, out + location_end, to_move); + out[location_end] = '\n'; + offset = to_move; + out += 32 + 1; + } else { + offset += 32; + out += 32; + } + } else { // slow path + // could be optimized + alignas(32) uint8_t buffer[32]; + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), + lookup_pshufb_improved(indices)); + std::memcpy(out, buffer, 32); + size_t out_pos = 0; + size_t local_offset = offset; + for (size_t j = 
0; j < 32;) { + if (local_offset == line_length) { + out[out_pos++] = '\n'; + local_offset = 0; + } + out[out_pos++] = buffer[j++]; + local_offset++; + } + offset = local_offset; + out += out_pos; + } + } else { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved(indices)); + + out += 32; + } + } + return ((char *)out - (char *)dst) + + scalar::base64::tail_encode_base64_impl( + (char *)out, src + i, srclen - i, options, line_length, offset); +} + +template +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + return avx2_encode_base64_impl(dst, src, srclen, options); +} + +static inline void compress(__m128i data, uint16_t mask, char *output) { + if (mask == 0) { + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data); + return; + } + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + + __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2], + tables::base64::thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask + shufmask = + _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m128i pruned = _mm_shuffle_epi8(data, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = tables::base64::BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. 
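Stated in scalar terms, everything this routine achieves with thintable_epi8, a popcount and two shuffles is the following loop; compress_scalar is an illustrative name, not a simdutf symbol:

#include <cstddef>
#include <cstdint>

// Keep byte i exactly when bit i of `mask` is clear; flagged bytes (the
// whitespace/garbage positions found by the base64 decoder) are squeezed
// out. The caller advances the output by popcount(~mask), as above.
size_t compress_scalar(const uint8_t in[16], uint16_t mask, uint8_t *out) {
  size_t pos = 0;
  for (int i = 0; i < 16; i++) {
    if ((mask & (1u << i)) == 0) {
      out[pos++] = in[i];
    }
  }
  return pos;
}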
+ __m128i compactmask = _mm_loadu_si128(reinterpret_cast( + tables::base64::pshufb_combine_table + pop1 * 8)); + __m128i answer = _mm_shuffle_epi8(pruned, compactmask); + + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); +} + +// --- decoding ----------------------------------------------- + +template +simdutf_really_inline void compress(__m256i data, uint32_t mask, char *output) { + if (mask == 0) { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data); + return; + } + compress(_mm256_castsi256_si128(data), uint16_t(mask), output); + compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16), + output + count_ones(~mask & 0xFFFF)); +} + +template +simdutf_really_inline void base64_decode(char *out, __m256i str) { + // credit: aqrit + const __m256i pack_shuffle = + _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); + const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140)); + const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000)); + const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle); + + // Store the output: + _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2)); + _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1)); +} + +template +simdutf_really_inline void base64_decode_block(char *out, const char *src) { + base64_decode(out, + _mm256_loadu_si256(reinterpret_cast(src))); + base64_decode(out + 24, _mm256_loadu_si256( + reinterpret_cast(src + 32))); +} + +template +simdutf_really_inline void base64_decode_block_safe(char *out, + const char *src) { + base64_decode(out, + _mm256_loadu_si256(reinterpret_cast(src))); + alignas(32) char buffer[32]; // We enforce safety with a buffer. + base64_decode( + buffer, _mm256_loadu_si256(reinterpret_cast(src + 32))); + std::memcpy(out + 24, buffer, 24); +} + +// --- decoding - base64 class -------------------------------- + +class block64 { + __m256i chunks[2]; + +public: + // The caller of this function is responsible to ensure that there are 64 + // bytes available from reading at src. + simdutf_really_inline block64(const char *src) { + chunks[0] = _mm256_loadu_si256(reinterpret_cast(src)); + chunks[1] = _mm256_loadu_si256(reinterpret_cast(src + 32)); + } + + // The caller of this function is responsible to ensure that there are 128 + // bytes available from reading at src. 
+ simdutf_really_inline block64(const char16_t *src) { + const auto m1 = _mm256_loadu_si256(reinterpret_cast(src)); + const auto m2 = + _mm256_loadu_si256(reinterpret_cast(src + 16)); + const auto m3 = + _mm256_loadu_si256(reinterpret_cast(src + 32)); + const auto m4 = + _mm256_loadu_si256(reinterpret_cast(src + 48)); + + const auto m1p = _mm256_permute2x128_si256(m1, m2, 0x20); + const auto m2p = _mm256_permute2x128_si256(m1, m2, 0x31); + const auto m3p = _mm256_permute2x128_si256(m3, m4, 0x20); + const auto m4p = _mm256_permute2x128_si256(m3, m4, 0x31); + + chunks[0] = _mm256_packus_epi16(m1p, m2p); + chunks[1] = _mm256_packus_epi16(m3p, m4p); + } + + simdutf_really_inline void copy_block(char *output) { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), chunks[0]); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), chunks[1]); + } + + // decode 64 bytes and output 48 bytes + simdutf_really_inline void base64_decode_block(char *out) { + base64_decode(out, chunks[0]); + base64_decode(out + 24, chunks[1]); + } + + simdutf_really_inline void base64_decode_block_safe(char *out) { + base64_decode(out, chunks[0]); + alignas(32) char buffer[32]; // We enforce safety with a buffer. + base64_decode(buffer, chunks[1]); + std::memcpy(out + 24, buffer, 24); + } + + template + simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) { + uint32_t err0 = 0; + uint32_t err1 = 0; + uint64_t m0 = to_base64_mask( + &chunks[0], &err0); + uint64_t m1 = to_base64_mask( + &chunks[1], &err1); + if (!ignore_garbage) { + *error = err0 | ((uint64_t)err1 << 32); + } + return m0 | (m1 << 32); + } + + template + simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) { + const __m256i ascii_space_tbl = + _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, + 0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0); + // credit: aqrit + __m256i delta_asso; + if (default_or_url) { + delta_asso = _mm256_setr_epi8( + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16); + } else if (base64_url) { + delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, + 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, + 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, + 0x0, 0x0, 0xF, 0x0, 0xF); + } else { + delta_asso = _mm256_setr_epi8( + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); + } + + __m256i delta_values; + if (default_or_url) { + delta_values = _mm256_setr_epi8( + uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13), + uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), + uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11), + uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9), + uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13), + uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), + uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11), + uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9)); + } else if (base64_url) { + delta_values = _mm256_setr_epi8( + 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), + uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0), + uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 
0x4, uint8_t(0xBF), + uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), + uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); + } else { + delta_values = _mm256_setr_epi8( + int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04), + int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00), + int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), + int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), + int8_t(0xB9), int8_t(0xB9)); + } + + __m256i check_asso; + if (default_or_url) { + check_asso = _mm256_setr_epi8( + 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, + 0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06); + } else if (base64_url) { + check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, + 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1, + 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3, + 0x7, 0xB, 0xE, 0xB, 0x6); + } else { + check_asso = _mm256_setr_epi8( + 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, + 0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); + } + __m256i check_values; + if (default_or_url) { + check_values = _mm256_setr_epi8( + uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6), + uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80), + uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80), + uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6), + uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80), + uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80)); + } else if (base64_url) { + check_values = _mm256_setr_epi8( + uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6), + uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80), + 0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), + uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, + uint8_t(0x80), 0x0, uint8_t(0x80)); + } else { + check_values = _mm256_setr_epi8( + int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF), + int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86), + int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91), + int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), + int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), + int8_t(0x91), int8_t(0x80)); + } + const __m256i shifted = _mm256_srli_epi32(*src, 3); + __m256i delta_hash = + _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted); + if (default_or_url) { + delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf)); + } + const __m256i check_hash = + _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted); + const __m256i out = + _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src); + const __m256i chk = + _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src); + const int mask 
= _mm256_movemask_epi8(chk); + if (!ignore_garbage && mask) { + __m256i ascii_space = + _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src); + *error = (mask ^ _mm256_movemask_epi8(ascii_space)); + } + *src = out; + return (uint32_t)mask; + } + + simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) { + if (is_power_of_two(mask)) { + return compress_block_single(mask, output); + } + + uint64_t nmask = ~mask; + compress(chunks[0], uint32_t(mask), output); + compress(chunks[1], uint32_t(mask >> 32), + output + count_ones(nmask & 0xFFFFFFFF)); + return count_ones(nmask); + } + + simdutf_really_inline size_t compress_block_single(uint64_t mask, + char *output) { + const size_t pos64 = trailing_zeroes(mask); + const int8_t pos = pos64 & 0xf; + switch (pos64 >> 4) { + case 0b00: { + const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0); + const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1); + + const __m128i v0 = _mm_set1_epi8(char(pos - 1)); + const __m128i v1 = + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i v2 = _mm_cmpgt_epi8(v1, v0); + const __m128i sh = _mm_sub_epi8(v1, v2); + const __m128i compressed = _mm_shuffle_epi8(lane0, sh); + + _mm_storeu_si128((__m128i *)(output + 0 * 16), compressed); + _mm_storeu_si128((__m128i *)(output + 1 * 16 - 1), lane1); + _mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]); + } break; + case 0b01: { + const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0); + const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1); + _mm_storeu_si128((__m128i *)(output + 0 * 16), lane0); + + const __m128i v0 = _mm_set1_epi8(char(pos - 1)); + const __m128i v1 = + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i v2 = _mm_cmpgt_epi8(v1, v0); + const __m128i sh = _mm_sub_epi8(v1, v2); + const __m128i compressed = _mm_shuffle_epi8(lane1, sh); + + _mm_storeu_si128((__m128i *)(output + 1 * 16), compressed); + _mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]); + } break; + case 0b10: { + const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0); + const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1); + + _mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]); + + const __m128i v0 = _mm_set1_epi8(char(pos - 1)); + const __m128i v1 = + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i v2 = _mm_cmpgt_epi8(v1, v0); + const __m128i sh = _mm_sub_epi8(v1, v2); + const __m128i compressed = _mm_shuffle_epi8(lane2, sh); + + _mm_storeu_si128((__m128i *)(output + 2 * 16), compressed); + _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), lane3); + } break; + case 0b11: { + const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0); + const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1); + + _mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]); + _mm_storeu_si128((__m128i *)(output + 2 * 16), lane2); + + const __m128i v0 = _mm_set1_epi8(char(pos - 1)); + const __m128i v1 = + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i v2 = _mm_cmpgt_epi8(v1, v0); + const __m128i sh = _mm_sub_epi8(v1, v2); + const __m128i compressed = _mm_shuffle_epi8(lane3, sh); + + _mm_storeu_si128((__m128i *)(output + 3 * 16), compressed); + } break; + } + + return 63; + } +}; +/* end file src/haswell/avx2_base64.cpp */ +#endif // SIMDUTF_FEATURE_BASE64 + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf + +/* begin file 
src/generic/buf_block_reader.h */ +namespace simdutf { +namespace haswell { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with +// spaces +template struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 + * (in which case this function fills the buffer with spaces and returns 0. In + * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder + * block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); + +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text_64(const uint8_t *text) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text(const simd8x64 &in) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + if (buf[i] < ' ') { + buf[i] = '_'; + } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char *format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i = 0; i < 64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline +buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) + : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, + idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { + return idx; +} + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t * +buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t +buf_block_reader::get_remainder(uint8_t *dst) const { + if (len == idx) { + return 0; + } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, + STEP_SIZE); // std::memset STEP_SIZE because it is more efficient + // to write out 8 or 16 bytes at once. 
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { + +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = {255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0b11110000u - 1, + 0b11100000u - 1, + 0b11000000u - 1}; + const simd8 max_value( + &max_array[sizeof(max_array) - sizeof(simd8)]); + return input.gt_bits(max_value); +} + +struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. 
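+
+  // Illustrative worked example (not from upstream simdutf): for the
+  // overlong pair 0xE0 0x80, the three lookups in check_special_cases give
+  //   byte_1_high[0xE] = TOO_SHORT | OVERLONG_3 | SURROGATE
+  //   byte_1_low[0x0]  = CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4
+  //   byte_2_high[0x8] = TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 |
+  //                      TOO_LARGE_1000 | OVERLONG_4
+  // and their AND is OVERLONG_3: a nonzero result, so the pair is rejected.
+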
+ simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64 &input) { + if (simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = + is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; + } + } + + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char *input, size_t length) { + return generic_validate_utf8( + reinterpret_cast(input), length); +} + +/** + * Validates that the string is actual UTF-8 and stops on errors. 
+ */ +template +result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input + count), length - count); + res.count += count; + return res; + } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_utf8_with_errors(const char *input, size_t length) { + return generic_validate_utf8_with_errors( + reinterpret_cast(input), length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_ASCII +/* begin file src/generic/ascii_validation.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace ascii_validation { + +result generic_validate_ascii_with_errors(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); + + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} + +bool generic_validate_ascii(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + return false; + } + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + return in.is_ascii(); +} + +} // namespace ascii_validation +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/ascii_validation.h */ +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + // transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + +template +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. 
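+  // Worked example of the masks computed below (not from upstream simdutf):
+  // for the bytes 41 C3 A9 E2 82 AC ("A", U+00E9, U+20AC), bytes 2, 4 and 5
+  // are continuations, so the continuation mask is 0b110100, the leading
+  // mask is 0b001011, and shifting the leading mask right by one marks
+  // bytes 0 (end of "A") and 2 (end of U+00E9) as ends of code points; the
+  // end of U+20AC is marked by the first leading byte that follows it.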
+  size_t pos = 0;
+  char16_t *start{utf16_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    // this loop could be unrolled further. For example, we could process
+    // far more than 64 bytes at a time.
+    simd8x64 in(reinterpret_cast(input + pos));
+    if (in.is_ascii()) {
+      in.store_ascii_as_utf16(utf16_output);
+      utf16_output += 64;
+      pos += 64;
+    } else {
+      // Slow path. We hope that the compiler will recognize that this is a
+      // slow path. Anything that is not a continuation byte is a 'leading
+      // byte', that is, the start of a new code point.
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      // -65 is 0b10111111 in two's complement, so the largest possible
+      // continuation byte
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      // The *start* of code points is not so useful; rather, we want the
+      // *end* of code points.
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times when using solely
+      // the slow/regular path, and at least four times if there are fast
+      // paths.
+      while (pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 and 12 input bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        //
+        // Thus we may allow convert_masked_utf8_to_utf16 to process
+        // more bytes at a time under a fast-path mode where 16 bytes
+        // are consumed at once (e.g., when encountering ASCII).
+        size_t consumed = convert_masked_utf8_to_utf16(
+            input + pos, utf8_end_of_code_point_mask, utf16_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
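+      // Arithmetic behind the figures above (not from upstream simdutf):
+      // max_starting_point is pos0 + 52, so the inner loop cannot exit
+      // before pos >= pos0 + 52, and at most 64 - 52 == 12 bytes of the
+      // block are reprocessed; the worst case is thus 52/64, about 80%.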
+ } + } + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); + return utf16_output - start; +} + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + template + simdutf_really_inline size_t convert(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
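+    // Concrete case (not from upstream simdutf): with size == 100 and
+    // ASCII-only data, the backward scan stops at margin == 92 after seeing
+    // 8 lead bytes, so safety_margin == 9 and the SIMD loop below runs only
+    // while pos + 73 <= 100, leaving the last bytes to the scalar tail.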
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); + if (howmany == 0) { + return 0; + } + utf16_output += howmany; + } + return utf16_output - start; + } + + template + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. 
+ const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. 
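+        // For instance (not from upstream simdutf), if the block boundary
+        // fell between a lead byte E2 and its continuations 82 AC, the
+        // rewind backs up from in + pos to that lead byte so that scalar
+        // re-validation starts on a code-point boundary and res.count is an
+        // exact input offset.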
+ result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* begin file src/generic/utf8/utf16_length_from_utf8_bytemask.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t utf16_length_from_utf8_bytemask(const char *in, + size_t size) { + using vector_i8 = simd8; + using vector_u8 = simd8; + using vector_u64 = simd64; + + constexpr size_t N = vector_i8::SIZE; + constexpr size_t max_iterations = 255 / 2; + + auto counters = vector_u64::zero(); + auto local = vector_u8::zero(); + + size_t iterations = 0; + size_t pos = 0; + size_t count = 0; + for (; pos + N <= size; pos += N) { + const auto input = + vector_i8::load(reinterpret_cast(in + pos)); + + const auto continuation = input > int8_t(-65); + const auto utf_4bytes = vector_u8(input.value) >= uint8_t(240); + + local -= vector_u8(continuation); + local -= vector_u8(utf_4bytes); + + iterations += 1; + if (iterations == max_iterations) { + counters += sum_8bytes(local); + local = vector_u8::zero(); + iterations = 0; + } + } + + if (iterations > 0) { + count += local.sum_bytes(); + } + + count += counters.sum(); + + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + +} // namespace utf8 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8/utf16_length_from_utf8_bytemask.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + // transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf32 { + +using namespace simd; + +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char32_t *utf32_output) noexcept { + size_t pos = 0; + char32_t *start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! 
+ while (pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + size_t max_starting_point = (pos + 64) - 12; + while (pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32( + input + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, + utf32_output); + return utf32_output - start; +} + +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf32 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if (howmany == 0) { + return 0; + } + utf32_output += howmany; + } + return utf32_output - start; + } + + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. 
+ const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
+ } + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if (pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } + } + return result(error_code::SUCCESS, utf32_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +/* begin file src/generic/utf32.h */ +#include + +namespace simdutf { +namespace haswell { +namespace { +namespace utf32 { + +template T min(T a, T b) { return a <= b ? a : b; } + +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { + using vector_u32 = simd32; + + const char32_t *start = input; + + // we add up to three ones in a single iteration (see the vectorized loop in + // section #2 below) + const size_t max_increment = 3; + + const size_t N = vector_u32::ELEMENTS; + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else + const auto v_ffffff80 = vector_u32::splat(0xffffff80); + const auto v_fffff800 = vector_u32::splat(0xfffff800); + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + size_t counter = 0; + + // 1. 
vectorized loop unrolled 4 times + { + // we use vector of uint32 counters, this is why this limit is used + const size_t max_iterations = + std::numeric_limits::max() / (max_increment * 4); + size_t blocks = length / (N * 4); + length -= blocks * (N * 4); + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + simd32 acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in0 = vector_u32(input + 0 * N); + const auto in1 = vector_u32(input + 1 * N); + const auto in2 = vector_u32(input + 2 * N); + const auto in3 = vector_u32(input + 3 * N); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else + acc += min(one, in0 & v_ffffff80); + acc += min(one, in1 & v_ffffff80); + acc += min(one, in2 & v_ffffff80); + acc += min(one, in3 & v_ffffff80); + + acc += min(one, in0 & v_fffff800); + acc += min(one, in1 & v_fffff800); + acc += min(one, in2 & v_fffff800); + acc += min(one, in3 & v_fffff800); + + acc += min(one, in0 & v_ffff0000); + acc += min(one, in1 & v_ffff0000); + acc += min(one, in2 & v_ffff0000); + acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += 4 * N; + } + + counter += acc.sum(); + } + } + + // 2. vectorized loop for tail + { + const size_t max_iterations = + std::numeric_limits::max() / max_increment; + size_t blocks = length / N; + length -= blocks * N; + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + auto acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in = vector_u32(input); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else + acc += min(one, in & v_ffffff80); + acc += min(one, in & v_fffff800); + acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += N; + } + + counter += acc.sum(); + } + } + + const size_t consumed = input - start; + if (consumed != 0) { + // We don't count 0th bytes in the vectorized loops above, this + // is why we need to count them in the end. 
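+    // Per-character check (not from upstream simdutf): U+0061 crosses no
+    // threshold, costing 1 byte; U+00E9 crosses 0x7F (2 bytes); U+20AC
+    // crosses 0x7F and 0x7FF (3 bytes); U+1D11E crosses all three
+    // thresholds (4 bytes). The baseline of 1 byte per character is the
+    // "0th byte" added just below.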
+ counter += consumed; + } + + return counter + scalar::utf32::utf8_length_from_utf32(input, length); +} + +} // namespace utf32 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf32.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +// other functions +#if SIMDUTF_FEATURE_UTF8 +/* begin file src/generic/utf8.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char *in, size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + +#ifdef SIMDUTF_SIMD_HAS_BYTEMASK +simdutf_really_inline size_t count_code_points_bytemask(const char *in, + size_t size) { + using vector_i8 = simd8; + using vector_u8 = simd8; + using vector_u64 = simd64; + + constexpr size_t N = vector_i8::SIZE; + constexpr size_t max_iterations = 255 / 4; + + size_t pos = 0; + size_t count = 0; + + auto counters = vector_u64::zero(); + auto local = vector_u8::zero(); + size_t iterations = 0; + for (; pos + 4 * N <= size; pos += 4 * N) { + const auto input0 = + simd8::load(reinterpret_cast(in + pos + 0 * N)); + const auto input1 = + simd8::load(reinterpret_cast(in + pos + 1 * N)); + const auto input2 = + simd8::load(reinterpret_cast(in + pos + 2 * N)); + const auto input3 = + simd8::load(reinterpret_cast(in + pos + 3 * N)); + const auto mask0 = input0 > int8_t(-65); + const auto mask1 = input1 > int8_t(-65); + const auto mask2 = input2 > int8_t(-65); + const auto mask3 = input3 > int8_t(-65); + + local -= vector_u8(mask0); + local -= vector_u8(mask1); + local -= vector_u8(mask2); + local -= vector_u8(mask3); + + iterations += 1; + if (iterations == max_iterations) { + counters += sum_8bytes(local); + local = vector_u8::zero(); + iterations = 0; + } + } + + if (iterations > 0) { + count += local.sum_bytes(); + } + + count += counters.sum(); + + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} +#endif // SIMDUTF_SIMD_HAS_BYTEMASK + +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). 
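+    // Example (not from upstream simdutf): for 61 C3 A9 F0 9D 84 9E
+    // ("a", U+00E9, U+1D11E) there are three non-continuation bytes, hence
+    // three words, and the one byte >= 240 (F0) adds a fourth word for the
+    // surrogate pair: four UTF-16 code units in total.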
+ count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + +} // namespace utf8 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8.h */ +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + + scalar::utf16::count_code_points(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} + +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, + size_t size) { + return count_code_points(in, size); +} + +simdutf_really_inline void +change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { + size_t pos = 0; + + while (pos < size / 32 * 32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } + + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} + +} // namespace utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf16.h */ +/* begin file src/generic/utf16/utf8_length_from_utf16_bytemask.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf16 { + +using namespace simd; + +template +simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, + size_t size) { + size_t pos = 0; + + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; + + const auto one = vector_u16::splat(1); + + auto v_count = vector_u16::zero(); + + // each char16 yields at least one byte + size_t count = size / N * N; + + // in a single iteration the increment is 0, 1 or 2, despite we have + // three additions + constexpr size_t max_iterations = 65535 / 2; + size_t iteration = max_iterations; + + for (; pos < size / N * N; pos += N) { + auto input = vector_u16::load(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input = input.swap_bytes(); + } + // 0xd800 .. 
0xdbff - high surrogate
+    // 0xdc00 .. 0xdfff - low surrogate
+    const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800));
+
+    // c0 - chars that yield 2- or 3-byte UTF-8 codes
+    const auto c0 = min(input & uint16_t(0xff80), one);
+
+    // c1 - chars that yield 3-byte UTF-8 codes (including surrogates)
+    const auto c1 = min(input & uint16_t(0xf800), one);
+
+    /*
+        Explanation of how the counting works.
+
+        In the case of a non-surrogate character we count:
+        * always 1 -- see how `count` is initialized above;
+        * c0 = 1 if the current char yields 2 or 3 bytes;
+        * c1 = 1 if the current char yields 3 bytes.
+
+        Thus, we always have the correct count for the current char:
+        1, 2 or 3 bytes.
+
+        A trickier part is how we count surrogate pairs. Whenever
+        we encounter a surrogate (high or low), we first count it as
+        3 bytes and then subtract 1 (`is_surrogate` is -1 or 0).
+        Each surrogate char thus yields 2. A surrogate pair, that
+        is a high surrogate followed by a low one, yields
+        the expected 4 bytes.
+
+        It also correctly handles the case where the high surrogate is
+        processed by this loop but the low surrogate is counted
+        by the scalar procedure. The scalar procedure uses exactly
+        the approach described here, so for valid UTF-16
+        strings the count is always correct.
+    */
+    v_count += c0;
+    v_count += c1;
+    v_count += vector_u16(is_surrogate);
+
+    iteration -= 1;
+    if (iteration == 0) {
+      count += v_count.sum();
+      v_count = vector_u16::zero();
+      iteration = max_iterations;
+    }
+  }
+
+  if (iteration > 0) {
+    count += v_count.sum();
+  }
+
+  return count + scalar::utf16::utf8_length_from_utf16(in + pos,
+                                                       size - pos);
+}
+
+template
+simdutf_really_inline result
+utf8_length_from_utf16_with_replacement(const char16_t *in, size_t size) {
+  using vector_u16 = simd16;
+  constexpr size_t N = vector_u16::ELEMENTS;
+  if (N + 1 > size) {
+    return scalar::utf16::utf8_length_from_utf16_with_replacement(
+        in, size);
+  } // special case for short inputs
+  size_t pos = 0;
+  bool any_surrogates = false;
+
+  const auto one = vector_u16::splat(1);
+
+  auto v_count = vector_u16::zero();
+  auto v_mismatched_count = vector_u16::zero();
+
+  size_t count = 0;
+  size_t mismatched_count = 0;
+
+  // in a single iteration the increment is 0, 1 or 2, even though we have
+  // three additions
+  constexpr size_t max_iterations = 65535 / 2;
+  size_t iteration = max_iterations;
+
+  if (scalar::utf16::is_low_surrogate(in[0])) {
+    any_surrogates = true;
+    mismatched_count += 1;
+  }
+
+  for (; pos < (size - 1) / N * N; pos += N) {
+    auto input = vector_u16::load(reinterpret_cast(in + pos));
+    if simdutf_constexpr (!match_system(big_endian)) {
+      input = input.swap_bytes();
+    }
+    // 0xd800 .. 0xdbff - high surrogate
+    // 0xdc00 ..
0xdfff - low surrogate
+    const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800));
+
+    // c0 - chars that yield 2- or 3-byte UTF-8 codes
+    const auto c0 = min(input & uint16_t(0xff80), one);
+
+    // c1 - chars that yield 3-byte UTF-8 codes (including surrogates)
+    const auto c1 = min(input & uint16_t(0xf800), one);
+
+    v_count += c0;
+    v_count += c1;
+    v_count += vector_u16(is_surrogate);
+    if (is_surrogate.to_bitmask() != 0 ||
+        scalar::utf16::is_low_surrogate(in[pos + N])) {
+      any_surrogates = true;
+      auto input_next =
+          vector_u16::load(reinterpret_cast(in + pos + 1));
+      if simdutf_constexpr (!match_system(big_endian)) {
+        input_next = input_next.swap_bytes();
+      }
+
+      const auto lb_masked = input & (0xfc00);
+      const auto block_masked = input_next & (0xfc00);
+
+      const auto lb_is_high = lb_masked == (0xd800);
+      const auto block_is_low = block_masked == (0xdc00);
+
+      const auto illseq = min(vector_u16(lb_is_high ^ block_is_low), one);
+
+      v_mismatched_count += illseq;
+    }
+
+    iteration -= 1;
+    if (iteration == 0) {
+      count += v_count.sum();
+      v_count = vector_u16::zero();
+      mismatched_count += v_mismatched_count.sum();
+      v_mismatched_count = vector_u16::zero();
+      iteration = max_iterations;
+    }
+  }
+
+  if (iteration > 0) {
+    count += v_count.sum();
+    mismatched_count += v_mismatched_count.sum();
+  }
+
+  if (scalar::utf16::is_low_surrogate(in[pos])) {
+    any_surrogates = true;
+    if (!scalar::utf16::is_high_surrogate(in[pos - 1])) {
+      mismatched_count -= 1;
+      count += 2;
+      pos += 1;
+    }
+  }
+  count += pos;
+  count += mismatched_count;
+  if (scalar::utf16::is_high_surrogate(in[pos - 1])) {
+    any_surrogates = true;
+    if (pos == size) {
+      count += 2;
+    } else if (scalar::utf16::is_low_surrogate(in[pos])) {
+      pos += 1;
+      count += 2;
+    }
+  }
+  result scalar_result =
+      scalar::utf16::utf8_length_from_utf16_with_replacement(
+          in + pos, size - pos);
+  return {any_surrogates ? SURROGATE : scalar_result.error,
+          count + scalar_result.count};
+}
+
+} // namespace utf16
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/utf16/utf8_length_from_utf16_bytemask.h */
+#endif // SIMDUTF_FEATURE_UTF16
+#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
+/* begin file src/generic/validate_utf16.h */
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf16 {
+/*
+    UTF-16 validation
+    --------------------------------------------------
+
+    In UTF-16, code units in the range 0xD800 to 0xDFFF have special meaning.
+
+    In a vectorized algorithm we want to examine the most significant
+    nibble in order to select a fast path. If none of the highest nibbles
+    is 0xD (13), then we are sure that the UTF-16 chunk in a vector
+    register is valid.
+
+    Let us analyze what we need to check if the nibble is 0xD. The
+    value of the following nibble determines what we have:
+
+    0xd000 .. 0xd7ff - a valid word
+    0xd800 .. 0xdbff - low surrogate
+    0xdc00 .. 0xdfff - high surrogate
+
+    Other constraints we have to consider:
+    - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+    - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+    - there must not be a sole low surrogate nor a sole high surrogate
+
+    We are going to build three bitmasks based on the 3rd nibble:
+    - V = valid word,
+    - L = low surrogate (0xd800 .. 0xdbff)
+    - H = high surrogate (0xdc00 ..
+
+    0 1 2 3 4 5 6 7 <--- word index
+  [ V | L | H | L | H | V | V | L ]
+    1 0 0 0 0 1 1 0   - V = valid words
+    0 1 0 1 0 0 0 1   - L = low surrogate
+    0 0 1 0 1 0 0 0   - H = high surrogate
+
+    1 0 0 0 0 1 1 0   V = valid words
+    0 1 0 1 0 0 0 0   a = L & (H >> 1)
+    0 0 1 0 1 0 0 0   b = a << 1
+    1 1 1 1 1 1 1 0   c = V | a | b
+                  ^
+                  the last bit can be zero; we then consume only 7
+                  code units and recheck the last word in the next
+                  iteration
+*/
+template <endianness big_endian>
+const result validate_utf16_with_errors(const char16_t *input, size_t size) {
+  if (simdutf_unlikely(size == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+
+  const char16_t *start = input;
+  const char16_t *end = input + size;
+
+  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+  const auto v_fc = simd8<uint8_t>::splat(0xfc);
+  const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+  while (input + simd16<uint16_t>::SIZE * 2 < end) {
+    // 0. Load data: since the validation takes into account only the higher
+    // byte of each word, we compress the two vectors into one that consists
+    // only of the higher bytes.
+    auto in0 = simd16<uint16_t>(input);
+    auto in1 =
+        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+
+    // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16
+    // and yields a single vector containing only the higher bytes of the
+    // characters.
+    const auto in = utf16_gather_high_bytes(in0, in1);
+
+    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+    const auto surrogates_wordmask = (in & v_f8) == v_d8;
+    const uint16_t surrogates_bitmask =
+        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+    if (surrogates_bitmask == 0x0000) {
+      input += 16;
+    } else {
+      // 2. We have some surrogates that have to be distinguished:
+      //    - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+      //
+      // Fact: a high surrogate has the 11th bit set (the 3rd bit in the
+      // higher byte)
+
+      // V - non-surrogate code units
+      //     V = not surrogates_wordmask
+      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+
+      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+      const auto vH = (in & v_fc) == v_dc;
+      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+
+      // L - word mask for low surrogates
+      //     L = not H and surrogates_wordmask
+      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+
+      const uint16_t a = static_cast<uint16_t>(
+          L & (H >> 1)); // A low surrogate must be followed by a high one.
+                         // (A low surrogate placed in the last register word
+                         // is an exception we handle.)
+      const uint16_t b = static_cast<uint16_t>(
+          a << 1); // Just mark that the opposite fact holds; thanks to that
+                   // we have only two masks for the valid case.
+      const uint16_t c = static_cast<uint16_t>(
+          V | a | b); // Combine all the masks into the final one.
+
+      if (c == 0xffff) {
+        // The whole input register contains valid UTF-16, i.e.,
+        // either single code units or proper surrogate pairs.
+        input += 16;
+      } else if (c == 0x7fff) {
+        // The 15 lower code units of the input register contain valid
+        // UTF-16. The 15th word may be either a low or a high surrogate.
+        // In the next iteration we 1) check whether the low surrogate is
+        // followed by a high one, and 2) reject a sole high surrogate.
+        // (A worked bitmask replay of the 8-word example above follows.)
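+        // A sketch (editorial addition, not part of the upstream sources)
+        // replaying the 8-word example from the header comment above with
+        // concrete bitmasks, bit i standing for word i:
+#if 0
+        const uint16_t Vx = 0x61;           // words 0, 5, 6 are non-surrogates
+        const uint16_t Lx = 0x8a;           // words 1, 3, 7 are 0xd800..0xdbff
+        const uint16_t Hx = 0x14;           // words 2, 4 are 0xdc00..0xdfff
+        const uint16_t ax = Lx & (Hx >> 1); // 0x0a: each L directly followed by an H
+        const uint16_t bx = ax << 1;        // 0x14: the H halves of those pairs
+        const uint16_t cx = Vx | ax | bx;   // 0x7f: all words accounted for except
+                                            // word 7, which is rechecked next time
+#endif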
+ input += 15; + } else { + return result(error_code::SURROGATE, input - start); + } + } + } + + return result(error_code::SUCCESS, input - start); +} + +template +const result validate_utf16_as_ascii_with_errors(const char16_t *input, + size_t size) { + if (simdutf_unlikely(size == 0)) { + return result(error_code::SUCCESS, 0); + } + size_t pos = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input_vec( + reinterpret_cast(input + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input_vec.swap_bytes(); + } + uint64_t matches = input_vec.lteq(uint16_t(0x7f)); + if (~matches) { + // Found a match, return the first one + int index = trailing_zeroes(~matches) / 2; + return result(error_code::TOO_LARGE, pos + index); + } + } + + // Scalar tail + while (pos < size) { + + char16_t v = scalar::utf16::swap_if_needed(input[pos]); + if (v > 0x7F) { + return result(error_code::TOO_LARGE, pos); + } + pos++; + } + return result(error_code::SUCCESS, size); +} + +} // namespace utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/validate_utf16.h */ +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + // transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // For UTF-8 to Latin 1, we can allow any ASCII character, and any + // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or + // 0b11000010 and nothing else. + // + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + FORBIDDEN, + // 1110____ ________ + FORBIDDEN, + // 1111____ ________ + FORBIDDEN); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + FORBIDDEN, + // ____0101 ________ + FORBIDDEN, + // ____011_ ________ + FORBIDDEN, FORBIDDEN, + + // ____1___ ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, + // ____1101 ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 16; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. 
+ static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if (howmany == 0) { + return 0; + } + latin1_output += howmany; + } + return latin1_output - start; + } + + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
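+    // A sketch (editorial addition, not part of the upstream sources) of
+    // the two bit tricks used in this transcoder: a byte is a UTF-8
+    // continuation byte exactly when its two's-complement value is in
+    // [-128, -65] (0b10xxxxxx), and the end-of-code-point mask is the
+    // leading-byte mask shifted right by one position.
+#if 0
+    // Continuation bytes are 0b10xxxxxx = 0x80..0xbf, i.e. <= -65 as int8_t:
+    auto is_continuation = [](char c) { return int8_t(c) <= -65; };
+    // Bit i of `cont` set <=> byte i is a continuation byte; bit i of the
+    // result set <=> byte i ends a code point (byte i + 1 starts a new one):
+    auto end_of_code_point_mask = [](uint64_t cont) { return ~cont >> 1; };
+#endif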
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. 
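+        // A sketch (editorial addition, not part of the upstream sources) of
+        // what "rewinding" means here: step back over continuation bytes so
+        // that error reporting restarts at the leading byte of the code point
+        // in which the vectorized loop stopped. `rewind_to_leading_byte` is a
+        // hypothetical name, not the library's helper.
+#if 0
+        auto rewind_to_leading_byte = [](const char *buf, size_t i) {
+          while (i > 0 && int8_t(buf[i]) <= -65) {
+            i--; // 0b10xxxxxx bytes are continuations; keep stepping back
+          }
+          return i; // index of a leading (or ASCII) byte
+        };
+#endif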
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + latin1_output += res.count; + } + } + return result(error_code::SUCCESS, latin1_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + +simdutf_really_inline size_t convert_valid(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last + // 16 bytes, and if the data is valid, then it is entirely safe because 16 + // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally + // assume that you have valid UTF-8 input, so we are going to go back from the + // end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. 
These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
+    }
+  }
+  if (pos < size) {
+    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
+                                                           latin1_output);
+    latin1_output += howmany;
+  }
+  return latin1_output - start;
+}
+
+} // namespace utf8_to_latin1
+} // namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+
+#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
+/* begin file src/generic/validate_utf32.h */
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf32 {
+
+simdutf_really_inline bool validate(const char32_t *input, size_t size) {
+  if (simdutf_unlikely(size == 0)) {
+    // empty input is valid UTF-32. protect the implementation from
+    // handling nullptr
+    return true;
+  }
+
+  const char32_t *end = input + size;
+
+  using vector_u32 = simd32<uint32_t>;
+
+  const auto standardmax = vector_u32::splat(0x10ffff);
+  const auto offset = vector_u32::splat(0xffff2000);
+  const auto standardoffsetmax = vector_u32::splat(0xfffff7ff);
+  auto currentmax = vector_u32::zero();
+  auto currentoffsetmax = vector_u32::zero();
+
+  constexpr size_t N = vector_u32::ELEMENTS;
+
+  while (input + N < end) {
+    auto in = vector_u32(input);
+    if simdutf_constexpr (!match_system(endianness::BIG)) {
+      in.swap_bytes();
+    }
+
+    currentmax = max(currentmax, in);
+    currentoffsetmax = max(currentoffsetmax, in + offset);
+    input += N;
+  }
+
+  const auto too_large = currentmax > standardmax;
+  if (too_large.any()) {
+    return false;
+  }
+
+  const auto surrogate = currentoffsetmax > standardoffsetmax;
+  if (surrogate.any()) {
+    return false;
+  }
+
+  return scalar::utf32::validate(input, end - input);
+}
+
+simdutf_really_inline result validate_with_errors(const char32_t *input,
+                                                  size_t size) {
+  if (simdutf_unlikely(size == 0)) {
+    // empty input is valid UTF-32.
protect the implementation from + // handling nullptr + return result(error_code::SUCCESS, 0); + } + + const char32_t *start = input; + const char32_t *end = input + size; + + using vector_u32 = simd32; + + const auto standardmax = vector_u32::splat(0x10ffff + 1); + const auto surrogate_mask = vector_u32::splat(0xfffff800); + const auto surrogate_byte = vector_u32::splat(0x0000d800); + + constexpr size_t N = vector_u32::ELEMENTS; + + while (input + N < end) { + auto in = vector_u32(input); + if simdutf_constexpr (!match_system(endianness::BIG)) { + in.swap_bytes(); + } + + const auto too_large = in >= standardmax; + const auto surrogate = (in & surrogate_mask) == surrogate_byte; + + const auto combined = too_large | surrogate; + if (simdutf_unlikely(combined.any())) { + const size_t consumed = input - start; + auto sr = scalar::utf32::validate_with_errors(input, end - input); + sr.count += consumed; + + return sr; + } + + input += N; + } + + const size_t consumed = input - start; + auto sr = scalar::utf32::validate_with_errors(input, end - input); + sr.count += consumed; + + return sr; +} + +} // namespace utf32 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/validate_utf32.h */ +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_BASE64 +/* begin file src/generic/base64.h */ +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ +namespace simdutf { +namespace haswell { +namespace { +namespace base64 { + +/* + The following template function implements API for Base64 decoding. + + An implementation is responsible for providing the `block64` type and + associated methods that perform actual conversion. Please refer + to any vectorized implementation to learn the API of these procedures. +*/ +template +full_result +compress_decode_base64(char *dst, const chartype *src, size_t srclen, + base64_options options, + last_chunk_handling_options last_chunk_options) { + const uint8_t *to_base64 = + default_or_url ? tables::base64::to_base64_default_or_url_value + : (base64_url ? 
tables::base64::to_base64_url_value + : tables::base64::to_base64_value); + auto ri = simdutf::scalar::base64::find_end(src, srclen, options); + size_t equallocation = ri.equallocation; + size_t equalsigns = ri.equalsigns; + srclen = ri.srclen; + size_t full_input_length = ri.full_input_length; + if (srclen == 0) { + if (!ignore_garbage && equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, full_input_length, 0}; + } + char *end_of_safe_64byte_zone = + dst == nullptr + ? nullptr + : ((srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 + : dst); + + const chartype *const srcinit = src; + const char *const dstinit = dst; + const chartype *const srcend = src + srclen; + + constexpr size_t block_size = 6; + static_assert(block_size >= 2, "block_size must be at least two"); + char buffer[block_size * 64]; + char *bufferptr = buffer; + if (srclen >= 64) { + const chartype *const srcend64 = src + srclen - 64; + while (src <= srcend64) { + block64 b(src); + src += 64; + uint64_t error = 0; + const uint64_t badcharmask = + b.to_base64_mask(&error); + if (!ignore_garbage && error) { + src -= 64; + const size_t error_offset = trailing_zeroes(error); + return {error_code::INVALID_BASE64_CHARACTER, + size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; + } + if (badcharmask != 0) { + bufferptr += b.compress_block(badcharmask, bufferptr); + } else if (bufferptr != buffer) { + b.copy_block(bufferptr); + bufferptr += 64; + } else { + if (dst >= end_of_safe_64byte_zone) { + b.base64_decode_block_safe(dst); + } else { + b.base64_decode_block(dst); + } + dst += 48; + } + if (bufferptr >= (block_size - 1) * 64 + buffer) { + for (size_t i = 0; i < (block_size - 2); i++) { + base64_decode_block(dst, buffer + i * 64); + dst += 48; + } + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, buffer + (block_size - 2) * 64); + } else { + base64_decode_block(dst, buffer + (block_size - 2) * 64); + } + dst += 48; + std::memcpy(buffer, buffer + (block_size - 1) * 64, + 64); // 64 might be too much + bufferptr -= (block_size - 1) * 64; + } + } + } + + char *buffer_start = buffer; + // Optimization note: if this is almost full, then it is worth our + // time, otherwise, we should just decode directly. 
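+  // A sketch (editorial addition, not part of the upstream sources) of the
+  // 4-to-3 byte packing performed by the tail loop below: four 6-bit values
+  // are accumulated into a 24-bit word and written out as three bytes, most
+  // significant byte first.
+#if 0
+  auto decode_quad = [](const uint8_t q[4], uint8_t out[3]) {
+    const uint32_t triple = (uint32_t(q[0]) << 18) | (uint32_t(q[1]) << 12) |
+                            (uint32_t(q[2]) << 6) | uint32_t(q[3]);
+    out[0] = uint8_t(triple >> 16); // bits 23..16
+    out[1] = uint8_t(triple >> 8);  // bits 15..8
+    out[2] = uint8_t(triple);       // bits 7..0
+  };
+#endif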
+ int last_block = (int)((bufferptr - buffer_start) % 64); + if (last_block != 0 && srcend - src + last_block >= 64) { + + while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { + uint8_t val = to_base64[uint8_t(*src)]; + *bufferptr = char(val); + if (!ignore_garbage && + (!scalar::base64::is_eight_byte(*src) || val > 64)) { + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + bufferptr += (val <= 63); + src++; + } + } + + for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, buffer_start); + } else { + base64_decode_block(dst, buffer_start); + } + dst += 48; + } + if ((bufferptr - buffer_start) % 64 != 0) { + while (buffer_start + 4 < bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; +#if !SIMDUTF_IS_BIG_ENDIAN + triple = scalar::u32_swap_bytes(triple); +#endif + std::memcpy(dst, &triple, 3); + + dst += 3; + buffer_start += 4; + } + if (buffer_start + 4 <= bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; +#if !SIMDUTF_IS_BIG_ENDIAN + triple = scalar::u32_swap_bytes(triple); +#endif + std::memcpy(dst, &triple, 3); + + dst += 3; + buffer_start += 4; + } + // we may have 1, 2 or 3 bytes left and we need to decode them so let us + // backtrack + int leftover = int(bufferptr - buffer_start); + while (leftover > 0) { + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + } else { + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } + } + src--; + leftover--; + } + } + if (src < srcend + equalsigns) { + full_result r = scalar::base64::base64_tail_decode( + dst, src, srcend - src, equalsigns, options, last_chunk_options); + r = scalar::base64::patch_tail_result( + r, size_t(src - srcinit), size_t(dst - dstinit), equallocation, + full_input_length, last_chunk_options); + // When is_partial(last_chunk_options) is true, we must either end with + // the end of the stream (beyond whitespace) or right after a non-ignorable + // character or at the very beginning of the stream. + // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS && + r.input_count < full_input_length) { + // First check if we can extend the input to the end of the stream + while (r.input_count < full_input_length && + base64_ignorable(*(srcinit + r.input_count), options)) { + r.input_count++; + } + // If we are still not at the end of the stream, then we must backtrack + // to the last non-ignorable character. 
+ if (r.input_count < full_input_length) { + while (r.input_count > 0 && + base64_ignorable(*(srcinit + r.input_count - 1), options)) { + r.input_count--; + } + } + } + return r; + } + if (!ignore_garbage && equalsigns > 0) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; + } + } + return {SUCCESS, srclen, size_t(dst - dstinit)}; +} + +} // namespace base64 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/base64.h */ +/* begin file src/generic/find.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace util { + +simdutf_really_inline const char *find(const char *start, const char *end, + char character) noexcept { + // Handle empty or invalid range + if (start >= end) + return end; + // Align the start pointer to 64 bytes + uintptr_t misalignment = reinterpret_cast(start) % 64; + if (misalignment != 0) { + size_t adjustment = 64 - misalignment; + if (size_t(std::distance(start, end)) < adjustment) { + adjustment = std::distance(start, end); + } + for (size_t i = 0; i < adjustment; i++) { + if (start[i] == character) { + return start + i; + } + } + start += adjustment; + } + + // Main loop for 64-byte aligned data + for (; std::distance(start, end) >= 64; start += 64) { + simd8x64 input(reinterpret_cast(start)); + uint64_t matches = input.eq(uint8_t(character)); + if (matches != 0) { + // Found a match, return the first one + int index = trailing_zeroes(matches); + return start + index; + } + } + return std::find(start, end, character); +} + +simdutf_really_inline const char16_t * +find(const char16_t *start, const char16_t *end, char16_t character) noexcept { + // Handle empty or invalid range + if (start >= end) + return end; + // Align the start pointer to 64 bytes if misalignment is even + uintptr_t misalignment = reinterpret_cast(start) % 64; + if (misalignment != 0 && misalignment % 2 == 0) { + size_t adjustment = (64 - misalignment) / sizeof(char16_t); + if (size_t(std::distance(start, end)) < adjustment) { + adjustment = std::distance(start, end); + } + for (size_t i = 0; i < adjustment; i++) { + if (start[i] == character) { + return start + i; + } + } + start += adjustment; + } + + // Main loop for 64-byte aligned data + for (; std::distance(start, end) >= 32; start += 32) { + simd16x32 input(reinterpret_cast(start)); + uint64_t matches = input.eq(uint16_t(character)); + if (matches != 0) { + // Found a match, return the first one + int index = trailing_zeroes(matches) / 2; + return start + index; + } + } + return std::find(start, end, character); +} + +} // namespace util +} // namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/find.h */ +#endif // SIMDUTF_FEATURE_BASE64 + +namespace simdutf { +namespace haswell { + +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. 
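+  // A sketch (editorial addition, not part of the upstream sources) of the
+  // surrogate "offset trick" used by the UTF-32 checks in this file
+  // (`validate` above and the utf32le checks below): adding 0xffff2000 with
+  // 32-bit wrap-around maps exactly 0xd800..0xdfff above 0xfffff7ff, so one
+  // unsigned max and one compare replace a two-sided range test.
+#if 0
+  auto is_surrogate_value = [](uint32_t w) {
+    return uint32_t(w + 0xffff2000u) > 0xfffff7ffu; // true iff 0xd800 <= w <= 0xdfff
+  };
+#endif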
+ auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + + int out = 0; + uint32_t utf16_err = (length % 2); + uint32_t utf32_err = (length % 4); + uint32_t ends_with_high = 0; + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + const __m256i standardmax = _mm256_set1_epi32(0x10ffff); + const __m256i offset = _mm256_set1_epi32(0xffff2000); + const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); + __m256i currentmax = _mm256_setzero_si256(); + __m256i currentoffsetmax = _mm256_setzero_si256(); + + utf8_checker c{}; + buf_block_reader<64> reader(reinterpret_cast(input), length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + // utf8 checks + c.check_next_input(in); + + // utf16le checks + auto in0 = simd16(in.chunks[0]); + auto in1 = simd16(in.chunks[1]); + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const auto in2 = simd16::pack(t0, t1); + const auto surrogates_wordmask = (in2 & v_f8) == v_d8; + const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); + const auto vL = (in2 & v_fc) == v_dc; + const uint32_t L = vL.to_bitmask(); + const uint32_t H = L ^ surrogates_bitmask; + utf16_err |= (((H << 1) | ends_with_high) != L); + ends_with_high = (H & 0x80000000) != 0; + + // utf32le checks + currentmax = _mm256_max_epu32(in.chunks[0], currentmax); + currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[0], offset), + currentoffsetmax); + currentmax = _mm256_max_epu32(in.chunks[1], currentmax); + currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[1], offset), + currentoffsetmax); + + reader.advance(); + } + + uint8_t block[64]{}; + size_t idx = reader.block_index(); + std::memcpy(block, &input[idx], length - idx); + simd::simd8x64 in(block); + c.check_next_input(in); + + // utf16le last block check + auto in0 = simd16(in.chunks[0]); + auto in1 = simd16(in.chunks[1]); + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const auto in2 = simd16::pack(t0, t1); + const auto surrogates_wordmask = (in2 & v_f8) == v_d8; + const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); + const auto vL = (in2 & v_fc) == v_dc; + const uint32_t L = vL.to_bitmask(); + const uint32_t H = L ^ surrogates_bitmask; + utf16_err |= (((H << 1) | ends_with_high) != L); + // this is required to check for last byte ending in high and end of input + // is reached + ends_with_high = (H & 0x80000000) != 0; + utf16_err |= ends_with_high; + + // utf32le last block check + currentmax = _mm256_max_epu32(in.chunks[0], currentmax); + currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[0], offset), + currentoffsetmax); + currentmax = _mm256_max_epu32(in.chunks[1], currentmax); + currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[1], offset), + currentoffsetmax); + + reader.advance(); + + c.check_eof(); + bool is_valid_utf8 = !c.errors(); + __m256i is_zero = + _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); + utf32_err |= (_mm256_testz_si256(is_zero, is_zero) == 0); + + is_zero = _mm256_xor_si256( + _mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + utf32_err |= (_mm256_testz_si256(is_zero, is_zero) == 0); + if (is_valid_utf8) { + out |= encoding_type::UTF8; + } + if (utf16_err == 0) { + out |= encoding_type::UTF16_LE; + } + if (utf32_err == 0) { + 
out |= encoding_type::UTF32_LE; + } + return out; +} +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_utf8(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return haswell::ascii_validation::generic_validate_ascii(buf, len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return haswell::ascii_validation::generic_validate_ascii_with_errors(buf, + len); +} +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return haswell::utf16::validate_utf16_as_ascii_with_errors< + endianness::LITTLE>(buf, len) + .error == SUCCESS; +} + +simdutf_warn_unused bool +implementation::validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return haswell::utf16::validate_utf16_as_ascii_with_errors( + buf, len) + .error == SUCCESS; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. protect the implementation from + // handling nullptr + return true; + } + const auto res = + haswell::utf16::validate_utf16_with_errors(buf, len); + if (res.is_err()) { + return false; + } + + if (res.count == len) { + return true; + } + + return scalar::utf16::validate(buf + res.count, + len - res.count); +} +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. 
protect the implementation from + // handling nullptr + return true; + } + const auto res = + haswell::utf16::validate_utf16_with_errors(buf, len); + if (res.is_err()) { + return false; + } + + if (res.count == len) { + return true; + } + + return scalar::utf16::validate(buf + res.count, + len - res.count); +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + + const result res = + haswell::utf16::validate_utf16_with_errors(buf, len); + if (res.count != len) { + const result scalar_res = + scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + const result res = + haswell::utf16::validate_utf16_with_errors(buf, len); + if (res.count != len) { + const result scalar_res = + scalar::utf16::validate_with_errors(buf + res.count, + len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +void implementation::to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return utf16fix_avx(input, len, output); +} + +void implementation::to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return utf16fix_avx(input, len, output); +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + return utf32::validate(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + return utf32::validate_with_errors(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + avx2_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; + + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + avx2_convert_latin1_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + avx2_convert_latin1_to_utf16(buf, len, utf16_output); + if 
(ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + avx2_convert_latin1_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *input, size_t size, char *latin1_output) const noexcept { + return utf8_to_latin1::convert_valid(input, size, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, + utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} +#endif 
// SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + haswell::avx2_convert_utf16_to_latin1(buf, len, + latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + haswell::avx2_convert_utf16_to_latin1(buf, len, + latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + avx2_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + avx2_convert_utf16_to_latin1_with_errors(buf, len, + latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - 
ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function + return convert_utf16be_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function + return convert_utf16le_to_latin1(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + haswell::avx2_convert_utf16_to_utf8(buf, len, + utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + haswell::avx2_convert_utf16_to_utf8(buf, len, + utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + haswell::avx2_convert_utf16_to_utf8_with_errors( + buf, len, utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + haswell::avx2_convert_utf16_to_utf8_with_errors( + buf, len, utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar 
fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + avx2_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + avx2_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return convert_utf32_to_latin1(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) 
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+
+#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char16_t *, char32_t *> ret =
+      haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
+                                                               utf32_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char16_t *, char32_t *> ret =
+      haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len,
+                                                            utf32_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char32_t *> ret =
+      haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(
+          buf, len, utf32_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf32_output; // Set count to the number of 32-bit code units written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char32_t *> ret =
+      haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(
+          buf, len, utf32_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf32_output; // Set count to the number of 32-bit code units written
+  return ret.first;
+}
+#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  return convert_utf32_to_utf8(buf, len, utf8_output);
+}
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+
+#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> ret =
+      avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> ret =
+      avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char16_t *> ret =
+      haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
+          buf, len, utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char16_t *> ret =
+      haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(
+          buf, len, utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+  return ret.first;
+}
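+// Note on the pattern shared by the *_with_errors conversions above: the
+// AVX2 kernel reports in ret.first.count the position it reached in the
+// *input* buffer; a scalar routine resumes from that position, and only at
+// the end is ret.first.count rewritten to the number of output code units.
+// As an illustrative sketch only (hypothetical names, not upstream code):
+//
+//   result convert_with_fallback(const src_t *buf, size_t len, dst_t *out) {
+//     std::pair<result, dst_t *> ret = simd_kernel(buf, len, out);
+//     if (ret.first.error) { return ret.first; } // error position is exact
+//     if (ret.first.count != len) {              // kernel stopped early
+//       result r = scalar_kernel(buf + ret.first.count,
+//                                len - ret.first.count, ret.second);
+//       if (r.error) { r.count += ret.first.count; return r; }
+//       ret.second += r.count;
+//     }
+//     ret.first.count = ret.second - out; // now: output code units written
+//     return ret.first;
+//   }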
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return convert_utf32_to_utf16le(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return convert_utf32_to_utf16be(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return convert_utf16le_to_utf32(buf, len, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return convert_utf16be_to_utf32(buf, len, utf32_output);
+}
+#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
+
+#if SIMDUTF_FEATURE_UTF16
+void implementation::change_endianness_utf16(const char16_t *input,
+                                             size_t length,
+                                             char16_t *output) const noexcept {
+  utf16::change_endianness_utf16(input, length, output);
+}
+
+simdutf_warn_unused size_t implementation::count_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::count_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::BIG>(input, length);
+}
+#endif // SIMDUTF_FEATURE_UTF16
+
+#if SIMDUTF_FEATURE_UTF8
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *in, size_t size) const noexcept {
+  return utf8::count_code_points_bytemask(in, size);
+}
+#endif // SIMDUTF_FEATURE_UTF8
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+    const char *buf, size_t len) const noexcept {
+  return count_utf8(buf, len);
+}
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16_bytemask<endianness::LITTLE>(input,
+                                                                    length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16_bytemask<endianness::BIG>(input,
+                                                                 length);
+}
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+
+#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  return utf8::utf16_length_from_utf8_bytemask(input, length);
+}
+simdutf_warn_unused result
+implementation::utf8_length_from_utf16le_with_replacement(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16_with_replacement<endianness::LITTLE>(
+      input, length);
+}
+
+simdutf_warn_unused result
+implementation::utf8_length_from_utf16be_with_replacement(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16_with_replacement<endianness::BIG>(
+      input, length);
+}
+
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t len) const noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+  size_t answer = len / sizeof(__m256i) * sizeof(__m256i);
+  size_t i = 0;
+  if (answer >= 2048) { // long strings optimization
+    __m256i four_64bits = _mm256_setzero_si256();
+    while (i + sizeof(__m256i) <= len) {
+      __m256i runner = _mm256_setzero_si256();
+      // We can do up to 255 loops without overflow.
+      size_t iterations = (len - i) / sizeof(__m256i);
+      if (iterations > 255) {
+        iterations = 255;
+      }
+      size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i);
+      for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) {
+        __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
+        __m256i input2 =
+            _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
+        __m256i input3 = _mm256_loadu_si256(
+            (const __m256i *)(data + i + 2 * sizeof(__m256i)));
+        __m256i input4 = _mm256_loadu_si256(
+            (const __m256i *)(data + i + 3 * sizeof(__m256i)));
+        __m256i input12 =
+            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
+                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
+        __m256i input23 =
+            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
+                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
+        __m256i input1234 = _mm256_add_epi8(input12, input23);
+        runner = _mm256_sub_epi8(runner, input1234);
+      }
+      for (; i <= max_i; i += sizeof(__m256i)) {
+        __m256i input_256_chunk =
+            _mm256_loadu_si256((const __m256i *)(data + i));
+        runner = _mm256_sub_epi8(
+            runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
+      }
+      four_64bits = _mm256_add_epi64(
+          four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256()));
+    }
+    answer += _mm256_extract_epi64(four_64bits, 0) +
+              _mm256_extract_epi64(four_64bits, 1) +
+              _mm256_extract_epi64(four_64bits, 2) +
+              _mm256_extract_epi64(four_64bits, 3);
+  } else if (answer > 0) {
+    for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) {
+      __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i));
+      uint32_t non_ascii = _mm256_movemask_epi8(latin);
+      answer += count_ones(non_ascii);
+    }
+  }
+  return answer + scalar::latin1::utf8_length_from_latin1(
+                      reinterpret_cast<const char *>(data + i), len - i);
+}
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
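+// Why the 255-iteration cap in utf8_length_from_latin1 above: each
+// _mm256_cmpgt_epi8 lane is 0x00 or 0xFF (-1), so _mm256_sub_epi8 grows each
+// unsigned 8-bit lane of `runner` by at most 1 per 32-byte block. After 255
+// blocks a lane may hold 255, the largest 8-bit value, so the counts are
+// flushed into 64-bit accumulators via _mm256_sad_epu8 before a 256th
+// increment could wrap to zero. The scalar logic being vectorized is simply
+// (reference sketch, not upstream code):
+//
+//   size_t n = len;
+//   for (size_t k = 0; k < len; k++) {
+//     n += (uint8_t)input[k] >> 7; // bytes >= 0x80 need two UTF-8 bytes
+//   }
+//   return n;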
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  return utf32::utf8_length_from_utf32(input, length);
+}
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+
+#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  const __m256i v_00000000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  size_t pos = 0;
+  size_t count = 0;
+  for (; pos + 8 <= length; pos += 8) {
+    __m256i in = _mm256_loadu_si256((__m256i *)(input + pos));
+    // the bytemask is set for code points below 0x10000 (one UTF-16 unit);
+    // the remaining lanes need a surrogate pair (two units)
+    const __m256i surrogate_bytemask =
+        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const uint32_t surrogate_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
+    size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4;
+    count += 8 + surrogate_count;
+  }
+  return count +
+         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  return utf8::count_code_points(input, length);
+}
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+
+#if SIMDUTF_FEATURE_BASE64
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_default_or_url) {
+    if (options == base64_options::base64_default_or_url_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  } else if (options & base64_url) {
+    if (options == base64_options::base64_url_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  } else {
+    if (options == base64_options::base64_default_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  }
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_default_or_url) {
+    if (options == base64_options::base64_default_or_url_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  } else if (options & base64_url) {
+    if (options == base64_options::base64_url_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  } else {
+    if (options == base64_options::base64_default_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  }
+}
+
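+// The option tests in base64_to_binary / base64_to_binary_details (above and
+// in the char16_t overloads below) dispatch each alphabet/garbage mode once,
+// up front, to a specialized instantiation of
+// base64::compress_decode_base64, keeping per-byte option checks out of the
+// hot decoding loop.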
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_default_or_url) {
+    if (options == base64_options::base64_default_or_url_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  } else if (options & base64_url) {
+    if (options == base64_options::base64_url_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  } else {
+    if (options == base64_options::base64_default_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  }
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_default_or_url) {
+    if (options == base64_options::base64_default_or_url_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  } else if (options & base64_url) {
+    if (options == base64_options::base64_url_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  } else {
+    if (options == base64_options::base64_default_accept_garbage) {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    } else {
+      return base64::compress_decode_base64(
+          output, input, length, options, last_chunk_options);
+    }
+  }
+}
+
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept {
+  if (options & base64_url) {
+    return encode_base64<true>(output, input, length, options);
+  } else {
+    return encode_base64<false>(output, input, length, options);
+  }
+}
+
+size_t implementation::binary_to_base64_with_lines(
+    const char *input, size_t length, char *output, size_t line_length,
+    base64_options options) const noexcept {
+  if (options & base64_url) {
+    return avx2_encode_base64_impl(output, input, length, options,
+                                   line_length);
+  } else {
+    return avx2_encode_base64_impl(output, input, length, options,
+                                   line_length);
+  }
+}
+
+const char *implementation::find(const char *start, const char *end,
+                                 char character) const noexcept {
+  return util::find(start, end, character);
+}
+
+const char16_t *implementation::find(const char16_t *start, const char16_t *end,
+                                     char16_t character) const noexcept {
+  return util::find(start, end, character);
+}
+#endif // SIMDUTF_FEATURE_BASE64
+
+} // namespace haswell
+} // namespace simdutf
+
+/* begin file src/simdutf/haswell/end.h */
+#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
+// nothing needed.
+#else +SIMDUTF_UNTARGET_REGION +#endif + +#undef SIMDUTF_SIMD_HAS_BYTEMASK + +#if SIMDUTF_GCC11ORMORE // workaround for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +SIMDUTF_POP_DISABLE_WARNINGS +#endif // end of workaround +/* end file src/simdutf/haswell/end.h */ +/* end file src/haswell/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 +/* begin file src/ppc64/implementation.cpp */ +/* begin file src/simdutf/ppc64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "ppc64" +// #define SIMDUTF_IMPLEMENTATION ppc64 +/* end file src/simdutf/ppc64/begin.h */ + +/* begin file src/ppc64/ppc64_utf16_to_utf8_tables.h */ +// Code generated automatically; DO NOT EDIT +// file generated by scripts/ppc64_convert_utf16_to_utf8.py +#ifndef PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H +#define PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H + +namespace simdutf { +namespace { +namespace tables { +namespace ppc64_utf16_to_utf8 { + +#if SIMDUTF_IS_BIG_ENDIAN +// 1 byte for length, 16 bytes for mask +const uint8_t pack_1_2_3_utf8_bytes[256][17] = { + {12, 1, 0, 16, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80}, + {9, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 16, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 17, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 1, 0, 16, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 16, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 17, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {11, 1, 0, 16, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 0, 16, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 17, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 1, 0, 16, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 0, 16, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 17, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 1, 0, 16, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 16, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 17, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 1, 0, 16, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 16, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 17, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 1, 0, 16, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 0, 16, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {11, 1, 0, 16, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 0, 16, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 17, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 1, 0, 16, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 1, 0, 16, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 0, 16, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 17, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 1, 0, 16, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 16, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 17, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 1, 0, 16, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 0, 16, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 17, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 1, 0, 16, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 1, 0, 16, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 16, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 17, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 1, 0, 16, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 1, 0, 16, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 16, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 17, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 1, 0, 16, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 16, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 17, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 1, 0, 16, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 0, 16, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 1, 0, 16, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 16, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 17, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 1, 0, 16, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {2, 0, 16, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {1, 17, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {5, 1, 0, 16, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 0, 16, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 17, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 1, 0, 16, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {1, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 0, 16, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 17, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {8, 1, 0, 16, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 1, 0, 16, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 0, 16, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 17, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 1, 0, 16, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 1, 0, 16, 19, 4, 20, 0x80, 0x80, 
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 16, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 17, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 1, 0, 16, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 1, 0, 16, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {1, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 0, 16, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 17, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {6, 1, 0, 16, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 16, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 17, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 0, 16, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {4, 0, 16, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 17, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {11, 1, 0, 16, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 0, 16, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 17, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 1, 0, 16, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 1, 0, 16, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 0, 16, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 17, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 1, 0, 16, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 16, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 17, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 1, 0, 16, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 
0x80, 0x80}, + {5, 1, 0, 16, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 0, 16, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 17, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 1, 0, 16, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 1, 0, 16, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 16, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 17, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {10, 1, 0, 16, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 0, 16, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 17, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 1, 0, 16, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 1, 0, 16, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 16, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 17, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 1, 0, 16, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 1, 0, 16, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 16, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 17, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 1, 0, 16, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 16, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 17, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 1, 0, 16, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 2, 18, 21, 6, 22, 
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 0, 16, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 1, 0, 16, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 0, 16, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 17, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 1, 0, 16, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 1, 0, 16, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 16, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 17, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 1, 0, 16, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 0, 16, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 1, 0, 16, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {1, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 0, 16, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 17, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {6, 1, 0, 16, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 16, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 17, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 0, 16, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {4, 0, 16, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 17, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {9, 1, 0, 16, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 16, 3, 2, 18, 
4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 17, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 1, 0, 16, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 16, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 17, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 1, 0, 16, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 0, 16, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 0, 16, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 16, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 17, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 1, 0, 16, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {4, 0, 16, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 17, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 1, 0, 16, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 0, 16, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 17, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 1, 0, 16, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 16, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 17, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, +}; +#else +// 1 byte for length, 16 bytes for mask +const uint8_t pack_1_2_3_utf8_bytes[256][17] = { + {12, 0, 1, 17, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80}, + {9, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 17, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 16, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 1, 17, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 17, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 16, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {11, 0, 1, 17, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 
0x80, 0x80, 0x80}, + {8, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 1, 17, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 16, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 0, 1, 17, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 1, 17, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 16, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 0, 1, 17, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 17, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 16, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 1, 17, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 17, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 16, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 0, 1, 17, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 1, 17, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {11, 0, 1, 17, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 1, 17, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 16, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 1, 17, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 0, 1, 17, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 1, 17, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 16, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 0, 1, 17, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 17, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 16, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 0, 1, 17, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 1, 17, 2, 
3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 16, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 1, 17, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 0, 1, 17, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 17, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 16, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 1, 17, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 0, 1, 17, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 17, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 16, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 1, 17, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 17, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 16, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 0, 1, 17, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 1, 17, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 0, 1, 17, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 17, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 16, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 0, 1, 17, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {2, 1, 17, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {1, 16, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {5, 0, 1, 17, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 3, 19, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 1, 17, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 16, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 0, 1, 17, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {1, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 1, 17, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 16, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {8, 0, 1, 17, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 1, 17, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 1, 17, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 16, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 0, 1, 17, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 0, 1, 17, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 17, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 16, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 0, 1, 17, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 1, 17, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {1, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 1, 17, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 16, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {6, 0, 1, 17, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 17, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 16, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 1, 17, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {4, 1, 17, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 
0x80}, + {3, 16, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {11, 0, 1, 17, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 1, 17, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 16, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 1, 17, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 0, 1, 17, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 1, 17, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 16, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 0, 1, 17, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 17, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 16, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 1, 17, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 1, 17, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 1, 17, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 16, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 0, 1, 17, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 0, 1, 17, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 17, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 16, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {10, 0, 1, 17, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 1, 17, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 16, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 1, 17, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 5, 
21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 0, 1, 17, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 17, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 16, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 1, 17, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 0, 1, 17, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 17, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 16, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 1, 17, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 17, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 16, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 0, 1, 17, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 1, 17, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 0, 1, 17, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 1, 17, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 16, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 1, 17, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 0, 1, 17, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 17, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 16, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 1, 17, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 18, 4, 
5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 1, 17, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 1, 17, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {1, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 1, 17, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 16, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {6, 0, 1, 17, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 17, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 16, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 0, 1, 17, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {4, 1, 17, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 16, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {9, 0, 1, 17, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 1, 17, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 16, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 1, 17, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 17, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 16, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 0, 1, 17, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 0, 1, 17, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 0, 1, 17, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 1, 17, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 16, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 1, 17, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {4, 1, 17, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 16, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 0, 1, 17, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 1, 17, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 16, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 0, 1, 17, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 1, 17, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 16, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, +}; +#endif // SIMDUTF_IS_BIG_ENDIAN +} // namespace ppc64_utf16_to_utf8 +} // namespace tables +} // unnamed namespace +} // namespace simdutf + +#endif // PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H +/* end file src/ppc64/ppc64_utf16_to_utf8_tables.h */ + +namespace simdutf { +namespace ppc64 { +namespace { +#ifndef SIMDUTF_PPC64_H + #error "ppc64.h must be included" +#endif +using namespace simd; + +simdutf_really_inline bool is_ascii(const simd8x64 &input) { + // careful: 0x80 is not ascii. + return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere(); +} + +simdutf_really_inline simd8 +must_be_2_3_continuation(const simd8 prev2, + const simd8 prev3) { + simd8 is_third_byte = + prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80 + simd8 is_fourth_byte = + prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80 + // Caller requires a bool (all 1's). All values resulting from the subtraction + // will be <= 64, so signed comparison is fine. + return simd8(is_third_byte | is_fourth_byte); +} + +/// ErrorReporting describes behaviour of a vectorized procedure regarding error +/// checking +enum class ErrorReporting { + precise, // the procedure will report *approximate* or *precise* error + // position + at_the_end, // the procedure will only inform about an error after scanning + // the whole input (or its significant portion) + none, // no error checking is done, we assume valid inputs +}; + +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/ppc64/ppc64_validate_utf16.cpp */ +template +simd8 utf16_gather_high_bytes(const simd16 in0, + const simd16 in1) { + if (big_endian) { + const vec_u8_t pack_high = { + 0, 2, 4, 6, 8, 10, 12, 14, // in0 + 16, 18, 20, 22, 24, 26, 28, 30 // in1 + }; + + return vec_perm(vec_u8_t(in0.value), vec_u8_t(in1.value), pack_high); + } else { + const vec_u8_t pack_high = { + 1, 3, 5, 7, 9, 11, 13, 15, // in0 + 17, 19, 21, 23, 25, 27, 29, 31 // in1 + }; + + return vec_perm(vec_u8_t(in0.value), vec_u8_t(in1.value), pack_high); + } +} +/* end file src/ppc64/ppc64_validate_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF8 +/* begin file src/ppc64/ppc64_convert_latin1_to_utf8.cpp */ +/* + * reads a vector of uint16 values + * bits after 11th are ignored + * first 11 bits are encoded into utf8 + * !important! 
utf8_output must have at least 16 writable bytes + */ +simdutf_really_inline void +write_v_u16_11bits_to_utf8(const vector_u16 v_u16, char *&utf8_output, + const vector_u8 one_byte_bytemask, + const uint16_t one_byte_bitmask) { + + // 0b1100_0000_1000_0000 + const auto v_c080 = vector_u16(0xc080); + // 0b0011_1111_0000_0000 + const auto v_1f00 = vector_u16(0x1f00); + // 0b0000_0000_0011_1111 + const auto v_003f = vector_u16(0x003f); + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + + // t0 = [0000|0000|00bb|bbbb] + const auto t0 = v_u16 & v_003f; + // t1 = [000a|aaaa|bbbb|bb00] + const auto t1 = v_u16.shl<2>(); + // t2 = [000a|aaaa|00bb|bbbb] + const auto t2 = select(v_1f00, t1, t0); + // t3 = [110a|aaaa|10bb|bbbb] + const auto t3 = t2 | v_c080; + + // 2. merge ASCII and 2-byte codewords + const auto utf8_unpacked1 = + select(one_byte_bytemask, as_vector_u8(v_u16), as_vector_u8(t3)); + +#if SIMDUTF_IS_BIG_ENDIAN + const auto tmp = as_vector_u16(utf8_unpacked1).swap_bytes(); +#else + const auto tmp = as_vector_u16(utf8_unpacked1); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto utf8_unpacked = as_vector_u8(tmp); + + // 3. prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a + // - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const auto shuffle = vector_u8::load(row + 1); + const auto utf8_packed = shuffle.lookup_16(utf8_unpacked); + + // 5. store bytes + utf8_packed.store(utf8_output); + + // 6. 
adjust pointers + utf8_output += row[0]; +} + +inline void write_v_u16_11bits_to_utf8(const vector_u16 v_u16, + char *&utf8_output, + const vector_u16 v_0000, + const vector_u16 v_ff80) { + // no bits set above 7th bit + const auto one_byte_bytemask = (v_u16 & v_ff80) == v_0000; + const uint16_t one_byte_bitmask = one_byte_bytemask.to_bitmask(); + + write_v_u16_11bits_to_utf8(v_u16, utf8_output, + as_vector_u8(one_byte_bytemask), one_byte_bitmask); +} + +std::pair +ppc64_convert_latin1_to_utf8(const char *latin_input, + const size_t latin_input_length, + char *utf8_output) { + const char *end = latin_input + latin_input_length; + + const auto v_0000 = vector_u16::zero(); + const auto v_00 = vector_u8::zero(); + + // 0b1111_1111_1000_0000 + const auto v_ff80 = vector_u16(0xff80); + +#if SIMDUTF_IS_BIG_ENDIAN + const auto latin_1_half_into_u16_byte_mask = + vector_u8(16, 0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7); + const auto latin_2_half_into_u16_byte_mask = + vector_u8(16, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15); +#else + const auto latin_1_half_into_u16_byte_mask = + vector_u8(0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16); + const auto latin_2_half_into_u16_byte_mask = + vector_u8(8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15, 16); +#endif // SIMDUTF_IS_BIG_ENDIAN + + // each latin1 takes 1-2 utf8 bytes + // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then + // adjust the pointer) so the last write can exceed the utf8_output size by + // 8-1 bytes by reserving 8 extra input bytes, we expect the output to have + // 8-16 bytes free + while (end - latin_input >= 16 + 8) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + const auto v_latin = vector_u8::load(latin_input); + + if (v_latin.is_ascii()) { // ASCII fast path!!!! + v_latin.store(utf8_output); + latin_input += 16; + utf8_output += 16; + continue; + } + + // assuming a/b are bytes and A/B are uint16 of the same value + // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA + const vector_u16 v_u16_latin_1_half = + as_vector_u16(latin_1_half_into_u16_byte_mask.lookup_32(v_latin, v_00)); + + // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB + const vector_u16 v_u16_latin_2_half = + as_vector_u16(latin_2_half_into_u16_byte_mask.lookup_32(v_latin, v_00)); + + write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80); + write_v_u16_11bits_to_utf8(v_u16_latin_2_half, utf8_output, v_0000, v_ff80); + latin_input += 16; + } + + if (end - latin_input >= 16) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + const auto v_latin = vector_u8::load(latin_input); + + if (v_latin.is_ascii()) { // ASCII fast path!!!! 
+ v_latin.store(utf8_output); + latin_input += 16; + utf8_output += 16; + } else { + // assuming a/b are bytes and A/B are uint16 of the same value + // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA + const auto v_u16_latin_1_half = as_vector_u16( + latin_1_half_into_u16_byte_mask.lookup_32(v_latin, v_00)); + + write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, + v_ff80); + latin_input += 8; + } + } + + return std::make_pair(latin_input, utf8_output); +} +/* end file src/ppc64/ppc64_convert_latin1_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF16 +/* begin file src/ppc64/ppc64_convert_latin1_to_utf16.cpp */ +template +size_t ppc64_convert_latin1_to_utf16(const char *latin1_input, size_t len, + char16_t *utf16_output) { + const size_t rounded_len = align_down(len); + + for (size_t i = 0; i < rounded_len; i += vector_u8::ELEMENTS) { + const auto in = vector_u8::load(&latin1_input[i]); + in.store_bytes_as_utf16(&utf16_output[i]); + } + + return rounded_len; +} +/* end file src/ppc64/ppc64_convert_latin1_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF32 +/* begin file src/ppc64/ppc64_convert_latin1_to_utf32.cpp */ +std::pair +ppc64_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + const size_t rounded_len = align_down(len); + + for (size_t i = 0; i < rounded_len; i += vector_u8::ELEMENTS) { + const auto in = vector_u8::load(&buf[i]); + in.store_bytes_as_utf32(&utf32_output[i]); + } + + return std::make_pair(buf + rounded_len, utf32_output + rounded_len); +} +/* end file src/ppc64/ppc64_convert_latin1_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/ppc64/ppc64_convert_utf8_to_latin1.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + +// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + const auto in = vector_u8::load(input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & + 0xfff; // we are only processing 12 bytes in case it is not all ASCII + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + in.store(latin1_output); + latin1_output += 12; // We wrote 12 characters. + return 12; // We consumed 12 bytes. + } + /// We do not have a fast path available, so we fallback. 
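+// For intuition, a scalar sketch of the mask that feeds the utf8bigindex
+// lookup: bit k is set exactly when byte k ends a code point, i.e. when the
+// next byte is not a continuation byte. The sketch assumes 13 readable
+// bytes; the library derives this mask from its vectorized classification,
+// not from a loop like this one.
+#if 0 // illustration only, not part of the upstream sources
+  #include <cstddef>
+  #include <cstdint>
+uint16_t end_of_code_point_mask_12(const unsigned char *input) {
+  uint16_t mask = 0;
+  for (size_t k = 0; k < 12; k++) {
+    // continuation bytes match 0b10xxxxxx; any other byte starts a new
+    // code point, so the preceding byte ended one
+    if ((input[k + 1] & 0xC0) != 0x80) {
+      mask |= uint16_t(1) << k;
+    }
+  }
+  return mask; // 0xfff <=> twelve 1-byte (ASCII) code points
+}
+#endif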
+  const uint8_t idx =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+  // this indicates an invalid input:
+  if (idx >= 64) {
+    return consumed;
+  }
+  // Here we should have (idx < 64); if not, there is a bug in the validation
+  // or elsewhere. This is a relatively easy scenario: we process SIX (6)
+  // input code units. The max length in bytes of six code units spanning
+  // between 1 and 2 bytes each is 12 bytes. On processors where pdep/pext is
+  // fast, we might be able to use a small lookup table.
+
+  const auto reshuffle = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
+  const auto perm8 = reshuffle.lookup_32(in, vector_u8::zero());
+#if SIMDUTF_IS_BIG_ENDIAN
+  const auto perm16 = as_vector_u16(perm8).swap_bytes();
+#else
+  const auto perm16 = as_vector_u16(perm8);
+#endif // SIMDUTF_IS_BIG_ENDIAN
+  const auto ascii = perm16 & uint16_t(0x7f);
+  const auto highbyte = perm16 & uint16_t(0x1f00);
+  const auto composed = ascii | highbyte.shr<2>();
+
+  const auto latin1_packed = vector_u16::pack(composed, composed);
+#if defined(__clang__)
+  __attribute__((aligned(16))) char buf[16];
+  latin1_packed.store(buf);
+  memcpy(latin1_output, buf, 6);
+#else
+  // writing 8 bytes even though we only care about the first 6 bytes.
+  const auto tmp = vec_u64_t(latin1_packed.value);
+  memcpy(latin1_output, &tmp[0], 8);
+#endif
+  latin1_output += 6; // We wrote 6 bytes.
+  return consumed;
+}
+/* end file src/ppc64/ppc64_convert_utf8_to_latin1.cpp */
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+/* begin file src/ppc64/ppc64_convert_utf8_to_utf16.cpp */
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 12).
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+                                    uint64_t utf8_end_of_code_point_mask,
+                                    char16_t *&utf16_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
+  //
+  // We first try a few fast paths.
+  const auto in = vector_u8::load(input);
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  if (utf8_end_of_code_point_mask == 0xfff) {
+    // We process the data in chunks of 12 bytes.
+    // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218
+    in.store_bytes_as_utf16(utf16_output);
+    utf16_output += 12; // We wrote 12 16-bit characters.
+    return 12;          // We consumed 12 bytes.
+  }
+  if (((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
+    // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte
+    // UTF-16 code units.
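+// Scalar model of this 2-byte fast path (assuming already-validated input):
+// [110a|aaaa][10bb|bbbb] decodes to the 11-bit value 00000aaaaabbbbbb, the
+// same result the mask-and-shift sequence below computes for 8 pairs at once.
+#if 0 // illustration only, not part of the upstream sources
+  #include <cstdint>
+char16_t decode_utf8_two_bytes(uint8_t lead, uint8_t cont) {
+  // keep 5 payload bits of the lead byte and 6 of the continuation byte
+  return char16_t((uint16_t(lead & 0x1F) << 6) | uint16_t(cont & 0x3F));
+}
+#endif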
+#if SIMDUTF_IS_BIG_ENDIAN + const auto in16 = as_vector_u16(in); +#else + const auto in16 = as_vector_u16(in).swap_bytes(); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto lo = in16 & uint16_t(0x007f); + const auto hi = in16.shr<2>(); + + auto composed = select(uint16_t(0x1f00 >> 2), hi, lo); + if simdutf_constexpr (!match_system(big_endian)) { + composed = composed.swap_bytes(); + } + + composed.store(utf16_output); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. There is probably a more efficient sequence, but the + // following might do. + + // AltiVec: it might be done better, for now SSE translation + + const auto sh = + vector_u8(2, 1, 0, 16, 5, 4, 3, 16, 8, 7, 6, 16, 11, 10, 9, 16); +#if SIMDUTF_IS_BIG_ENDIAN + const auto perm = + as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes(); +#else + const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero())); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto b0 = perm & uint32_t(0x0000007f); + const auto b1 = select(uint32_t(0x00003f00 >> 2), perm.shr<2>(), b0); + const auto b2 = select(uint32_t(0x000f0000 >> 4), perm.shr<4>(), b1); + const auto composed = b2; + auto packed = vector_u32::pack(composed, composed); + + if simdutf_constexpr (!match_system(big_endian)) { + packed = packed.swap_bytes(); + } + + packed.store(utf16_output); + utf16_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + + if (idx < 64) { + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small + // lookup table. + const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]); +#if SIMDUTF_IS_BIG_ENDIAN + const auto perm = + as_vector_u16(sh.lookup_32(in, vector_u8::zero())).swap_bytes(); +#else + const auto perm = as_vector_u16(sh.lookup_32(in, vector_u8::zero())); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto b0 = perm & uint16_t(0x007f); + const auto b1 = perm & uint16_t(0x1f00); + + auto composed = b0 | b1.shr<2>(); + + if simdutf_constexpr (!match_system(big_endian)) { + composed = composed.swap_bytes(); + } + + composed.store(utf16_output); + utf16_output += 6; // We wrote 12 bytes, 6 code points. 
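+// The 0x924 path above is the vectorized form of the scalar 3-byte decode
+// (again assuming validated input):
+// [1110|aaaa][10bb|bbbb][10cc|cccc] -> aaaabbbbbbcccccc.
+#if 0 // illustration only, not part of the upstream sources
+  #include <cstdint>
+char16_t decode_utf8_three_bytes(uint8_t b0, uint8_t b1, uint8_t b2) {
+  return char16_t((uint16_t(b0 & 0x0F) << 12) | (uint16_t(b1 & 0x3F) << 6) |
+                  uint16_t(b2 & 0x3F));
+}
+#endif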
+  } else if (idx < 145) {
+    // FOUR (4) input code units
+    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
+#if SIMDUTF_IS_BIG_ENDIAN
+    const auto perm =
+        as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
+#else
+    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));
+#endif // SIMDUTF_IS_BIG_ENDIAN
+    const auto b0 = perm & uint32_t(0x0000007f);
+    const auto b1 = perm & uint32_t(0x00003f00);
+    const auto b2 = perm & uint32_t(0x000f0000);
+
+    const auto composed = b0 | b1.shr<2>() | b2.shr<4>();
+
+    auto packed = vector_u32::pack(composed, composed);
+
+    if simdutf_constexpr (!match_system(big_endian)) {
+      packed = packed.swap_bytes();
+    }
+
+    packed.store(utf16_output);
+    utf16_output += 4;
+  } else if (idx < 209) {
+    // TWO (2) input code units
+    //////////////
+    // There might be garbage inputs where a leading byte masquerades as a
+    // four-byte leading byte (by being followed by 3 continuation bytes), but
+    // is not greater than 0xf0. This could trigger a buffer overflow if we
+    // only counted leading bytes of the form 0xf0 as generating surrogate
+    // pairs, without further UTF-8 validation. Thus we must be careful to
+    // ensure that only leading bytes at least as large as 0xf0 generate
+    // surrogate pairs. We do so at the cost of an extra mask.
+    /////////////
+    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
+#if SIMDUTF_IS_BIG_ENDIAN
+    const auto perm =
+        as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
+#else
+    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));
+#endif // SIMDUTF_IS_BIG_ENDIAN
+    const auto ascii = perm & uint32_t(0x00000007f);
+    const auto middlebyte = perm & uint32_t(0x00003f00);
+    const auto middlebyte_shifted = middlebyte.shr<2>();
+
+    auto middlehighbyte = perm & uint32_t(0x003f0000);
+    // correct for spurious high bit
+
+    const auto correct = (perm & uint32_t(0x00400000)).shr<1>();
+    middlehighbyte = correct ^ middlehighbyte;
+    const auto middlehighbyte_shifted = middlehighbyte.shr<4>();
+    // We deliberately carry the leading four bits in highbyte if they are
+    // present, we remove them later when computing hightenbits.
+    const auto highbyte = perm & uint32_t(0xff000000);
+    const auto highbyte_shifted = highbyte.shr<6>();
+    // When we need to generate a surrogate pair (leading byte >= 0xF0), the
+    // corresponding 32-bit value in 'composed' will be greater than
+    // (0xf0000000 >> 6), i.e. greater than 0x3c00000. This can be used later
+    // to identify the location of the surrogate pairs.
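+// The surrogate assembly below is the standard UTF-16 encoding of a
+// supplementary code point; the same arithmetic in scalar form:
+#if 0 // illustration only, not part of the upstream sources
+  #include <cstdint>
+void encode_surrogate_pair(uint32_t cp, char16_t out[2]) {
+  // valid only for 0x10000 <= cp <= 0x10FFFF
+  const uint32_t v = cp - 0x10000;         // 20 significant bits
+  out[0] = char16_t(0xD800 + (v >> 10));   // high ten bits
+  out[1] = char16_t(0xDC00 + (v & 0x3FF)); // low ten bits
+}
+#endif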
+ const auto composed = + ascii | middlebyte_shifted | highbyte_shifted | middlehighbyte_shifted; + + const auto composedminus = composed - uint32_t(0x10000); + const auto lowtenbits = composedminus & uint32_t(0x3ff); + // Notice the 0x3ff mask: + const auto hightenbits = composedminus.shr<10>() & uint32_t(0x3ff); + const auto lowtenbitsadd = lowtenbits + uint32_t(0xDC00); + const auto hightenbitsadd = hightenbits + uint32_t(0xD800); + const auto lowtenbitsaddshifted = lowtenbitsadd.shl<16>(); + auto surrogates = hightenbitsadd | lowtenbitsaddshifted; + + uint32_t basic_buffer[4]; + composed.store(basic_buffer); + uint32_t surrogate_buffer[4]; + surrogates.swap_bytes().store(surrogate_buffer); + + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] > 0x3c00000) { + const auto ch0 = uint16_t(surrogate_buffer[i] & 0xffff); + const auto ch1 = uint16_t(surrogate_buffer[i] >> 16); + if (match_system(big_endian)) { + utf16_output[1] = scalar::u16_swap_bytes(ch0); + utf16_output[0] = scalar::u16_swap_bytes(ch1); + } else { + utf16_output[1] = ch0; + utf16_output[0] = ch1; + } + utf16_output += 2; + } else { + const auto chr = uint16_t(basic_buffer[i]); + utf16_output[0] = scalar::utf16::swap_if_needed(chr); + utf16_output++; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/ppc64/ppc64_convert_utf8_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/ppc64/ppc64_convert_utf8_to_utf32.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + const auto in = vector_u8::load(input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + in.store_bytes_as_utf32(utf32_output); + utf32_output += 12; // We wrote 12 32-bit characters. + return 12; // We consumed 12 bytes. + } + if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte + // UTF-32 code units. 
+#if SIMDUTF_IS_BIG_ENDIAN + const auto perm = as_vector_u16(in); +#else + const auto perm = as_vector_u16(in).swap_bytes(); +#endif // SIMDUTF_IS_BIG_ENDIAN + // in = [110aaaaa|10bbbbbb] + // t0 = [00000000|00bbbbbb] + const auto t0 = perm & uint16_t(0x007f); + + // t1 = [00110aaa|aabbbbbb] + const auto t1 = perm.shr<2>(); + const auto composed = select(uint16_t(0x1f00 >> 2), t1, t0); + + const auto composed8 = as_vector_u8(composed); + composed8.store_words_as_utf32(utf32_output); + + utf32_output += 8; // We wrote 32 bytes, 8 code points. + return 16; + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte + // UTF-32 code units. +#if SIMDUTF_IS_BIG_ENDIAN + const auto sh = + vector_u8(-1, 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11); +#else + const auto sh = + vector_u8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero())); + + // in = [1110aaaa|10bbbbbb|10cccccc] + + // t0 = [00000000|00000000|00cccccc] + const auto t0 = perm & uint32_t(0x0000007f); + + // t2 = [00000000|0000bbbb|bbcccccc] + const auto t1 = perm.shr<2>(); + const auto t2 = select(uint32_t(0x00003f00 >> 2), t1, t0); + + // t4 = [00000000|aaaabbbb|bbcccccc] + const auto t3 = perm.shr<4>(); + const auto t4 = select(uint32_t(0x0f0000 >> 4), t3, t2); + + t4.store(utf32_output); + utf32_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small + // lookup table. + const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]); +#if SIMDUTF_IS_BIG_ENDIAN + const auto perm = + as_vector_u16(sh.lookup_32(in, vector_u8::zero())).swap_bytes(); +#else + const auto perm = as_vector_u16(sh.lookup_32(in, vector_u8::zero())); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto ascii = perm & uint16_t(0x7f); + const auto highbyte = perm & uint16_t(0x1f00); + const auto composed = ascii | highbyte.shr<2>(); + + as_vector_u8(composed).store_words_as_utf32(utf32_output); + utf32_output += 6; // We wrote 12 bytes, 6 code points. 
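+// The TWO (2) code-unit branch below handles sequences of up to 4 bytes;
+// since UTF-32 needs no surrogate assembly, a validated 4-byte sequence
+// decodes directly, as in this scalar reference:
+#if 0 // illustration only, not part of the upstream sources
+  #include <cstdint>
+char32_t decode_utf8_four_bytes(uint8_t b0, uint8_t b1, uint8_t b2,
+                                uint8_t b3) {
+  return char32_t((uint32_t(b0 & 0x07) << 18) | (uint32_t(b1 & 0x3F) << 12) |
+                  (uint32_t(b2 & 0x3F) << 6) | uint32_t(b3 & 0x3F));
+}
+#endif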
+ } else if (idx < 145) { + // FOUR (4) input code-code units + const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]); +#if SIMDUTF_IS_BIG_ENDIAN + const auto perm = + as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes(); +#else + const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero())); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto ascii = perm & uint32_t(0x7f); + const auto middlebyte = perm & uint32_t(0x3f00); + const auto middlebyte_shifted = middlebyte.shr<2>(); + const auto highbyte = perm & uint32_t(0x0f0000); + const auto highbyte_shifted = highbyte.shr<4>(); + const auto composed = ascii | middlebyte_shifted | highbyte_shifted; + + composed.store(utf32_output); + utf32_output += 4; + } else if (idx < 209) { + // TWO (2) input code-code units + const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]); +#if SIMDUTF_IS_BIG_ENDIAN + const auto perm = + as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes(); +#else + const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero())); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto ascii = perm & uint32_t(0x0000007f); + const auto middlebyte = perm & uint32_t(0x3f00); + const auto middlebyte_shifted = middlebyte.shr<2>(); + auto middlehighbyte = perm & uint32_t(0x003f0000); + // correct for spurious high bit + const auto correct0 = perm & uint32_t(0x00400000); + const auto correct = correct0.shr<1>(); + middlehighbyte = correct ^ middlehighbyte; + const auto middlehighbyte_shifted = middlehighbyte.shr<4>(); + const auto highbyte = perm & uint32_t(0x07000000); + const auto highbyte_shifted = highbyte.shr<6>(); + const auto composed = + ascii | middlebyte_shifted | highbyte_shifted | middlehighbyte_shifted; + composed.store(utf32_output); + utf32_output += 3; + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/ppc64/ppc64_convert_utf8_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/ppc64/ppc64_convert_utf16_to_latin1.cpp */ +struct utf16_to_latin1_t { + error_code err; + const char16_t *input; + char *output; +}; + +template +utf16_to_latin1_t ppc64_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + while (end - buf >= 8) { + // Load 8 x UTF-16 characters + auto in = vector_u8::load(buf); + + // Move low bytes of UTF-16 chars to lower half of `in` + // and upper bytes to upper half of `in`. 
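+// Latin-1 is exactly the subset of UTF-16 whose high byte is zero, so after
+// the permute the upper half of the register must be all zero. A scalar
+// model of the same test (assuming native-endian units):
+#if 0 // illustration only, not part of the upstream sources
+  #include <cstddef>
+  #include <cstdint>
+// index of the first unit that does not fit in Latin-1, or n if all fit
+size_t first_non_latin1_utf16(const char16_t *buf, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    if (uint16_t(buf[i]) > 0xFF) {
+      return i; // reported as TOO_LARGE below
+    }
+  }
+  return n;
+}
+#endif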
+ if simdutf_constexpr (!match_system(big_endian)) { + const auto perm = + vector_u8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + in = perm.lookup_16(in); + } else { + const auto perm = + vector_u8(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14); + in = perm.lookup_16(in); + } + + // AltiVec-specific +#if defined(__clang__) + __attribute__((aligned(16))) uint64_t tmp[8]; + in.store(tmp); + #if SIMDUTF_IS_BIG_ENDIAN + memcpy(latin1_output, &tmp[0], 8); + const uint64_t upper = tmp[1]; + #else + memcpy(latin1_output, &tmp[1], 8); + const uint64_t upper = tmp[0]; + #endif // SIMDUTF_IS_BIG_ENDIAN +#else + const auto tmp = vec_u64_t(in.value); + #if SIMDUTF_IS_BIG_ENDIAN + memcpy(latin1_output, &tmp[0], 8); + const uint64_t upper = tmp[1]; + #else + memcpy(latin1_output, &tmp[1], 8); + const uint64_t upper = tmp[0]; + #endif // SIMDUTF_IS_BIG_ENDIAN +#endif // defined(__clang__) + // AltiVec + + if (simdutf_unlikely(upper)) { + uint8_t bytes[8]; + memcpy(bytes, &upper, 8); + for (size_t k = 0; k < 8; k++) { + if (bytes[k] != 0) { + return utf16_to_latin1_t{error_code::TOO_LARGE, buf + k, + latin1_output}; + } + } + } else { + // Adjust pointers for next iteration + buf += 8; + latin1_output += 8; + } + } // while + + return utf16_to_latin1_t{error_code::SUCCESS, buf, latin1_output}; +} +/* end file src/ppc64/ppc64_convert_utf16_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8 +/* begin file src/ppc64/ppc64_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + +// Auxiliary procedure used by UTF-16 and UTF-32 into UTF-8. +// Note the pointer is passed by reference, it is updated by the procedure. 
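+// Per 16-bit code unit, the 1/2/3-byte split implemented below corresponds
+// to this scalar encoder (valid for non-surrogate code units only):
+#if 0 // illustration only, not part of the upstream sources
+  #include <cstddef>
+  #include <cstdint>
+size_t encode_utf8_from_u16(uint16_t w, uint8_t *out) {
+  if (w < 0x80) { // case 1: [0ccc|cccc]
+    out[0] = uint8_t(w);
+    return 1;
+  }
+  if (w < 0x800) { // case 2: [110b|bbbb] [10cc|cccc]
+    out[0] = uint8_t(0xC0 | (w >> 6));
+    out[1] = uint8_t(0x80 | (w & 0x3F));
+    return 2;
+  }
+  // case 3: [1110|aaaa] [10bb|bbbb] [10cc|cccc]
+  out[0] = uint8_t(0xE0 | (w >> 12));
+  out[1] = uint8_t(0x80 | ((w >> 6) & 0x3F));
+  out[2] = uint8_t(0x80 | (w & 0x3F));
+  return 3;
+}
+#endif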
+template <typename T>
+simdutf_really_inline void ppc64_convert_utf16_to_1_2_3_bytes_of_utf8(
+    const vector_u16 in, uint16_t one_byte_bitmask,
+    const T one_or_two_bytes_bytemask, uint16_t one_or_two_bytes_bitmask,
+    char *&utf8_output) {
+  // case: code units from the register produce either 1, 2 or 3 UTF-8 bytes
+#if SIMDUTF_IS_BIG_ENDIAN
+  const auto dup_lsb =
+      vector_u8(1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
+#else
+  const auto dup_lsb =
+      vector_u8(0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
+#endif // SIMDUTF_IS_BIG_ENDIAN
+
+  /* In this branch we handle three cases:
+  1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+     single UTF-8 byte
+  2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+     UTF-8 bytes
+  3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+     three UTF-8 bytes
+
+  We expand the input word (16-bit) into two code units (32-bit), thus
+  we have room for four bytes. However, we need five distinct bit
+  layouts. Note that the last byte in cases #2 and #3 is the same.
+
+  We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+  in register t2.
+
+  We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+  either byte 1 for case #2 or byte 2 for case #3. Note that they
+  differ by exactly one bit.
+
+  Finally, from these two code units we build a proper UTF-8 sequence, taking
+  into account the case (i.e., the number of bytes to write).
+  */
+  /**
+   * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+   * t2 => [0ccc|cccc] [10cc|cccc]
+   * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+   */
+  // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+  const auto t0 = as_vector_u16(dup_lsb.lookup_16(as_vector_u8(in)));
+
+  // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+  const auto t1 = t0 & uint16_t(0b0011111101111111);
+  // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+  const auto t2 = t1 | uint16_t(0b1000000000000000);
+
+  // in = [aaaa|bbbb|bbcc|cccc]
+  // a0 = [0000|0000|0000|aaaa]
+  const auto a0 = in.shr<12>();
+  // b0 = [aabb|bbbb|cccc|cc00]
+  const auto b0 = in.shl<2>();
+  // s0 = [00bb|bbbb|0000|aaaa]
+  const auto s0 = select(uint16_t(0x3f00), b0, a0);
+
+  // s3 = [11bb|bbbb|1110|aaaa]
+  const auto s3 = s0 | uint16_t(0b1100000011100000);
+
+  const auto m0 =
+      ~as_vector_u16(one_or_two_bytes_bytemask) & uint16_t(0b0100000000000000);
+  const auto s4 = s3 ^ m0;
+
+  // 4. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+  const uint16_t mask =
+      (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+  if (mask == 0) {
+    // We only have three-byte code units. Use fast path.
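+// Reading `mask`: each input unit owns two bits, bit 2k from
+// one_byte_bitmask and bit 2k+1 from one_or_two_bytes_bitmask, so 11 -> 1
+// output byte, 10 -> 2 bytes, 00 -> 3 bytes. Under that reading (our
+// interpretation of the table layout), row[0] of pack_1_2_3_utf8_bytes for
+// a half-register should equal:
+#if 0 // illustration only, not part of the upstream sources
+  #include <cstdint>
+int utf8_bytes_for_half(uint8_t mask_byte) { // 4 units, 2 bits each
+  int total = 0;
+  for (int u = 0; u < 4; u++) {
+    const int bits = (mask_byte >> (2 * u)) & 3;
+    total += 3 - (bits & 1) - ((bits >> 1) & 1); // 3, 2 or 1 bytes
+  }
+  return total;
+}
+#endif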
+#if SIMDUTF_IS_BIG_ENDIAN + // Lookups produced by scripts/ppc64_convert_utf16_to_utf8.py + const auto shuffle0 = + vector_u8(1, 0, 16, 3, 2, 18, 5, 4, 20, 7, 6, 22, 9, 8, 24, 11); + const auto shuffle1 = vector_u8(10, 26, 13, 12, 28, 15, 14, 30, -1, -1, -1, + -1, -1, -1, -1, -1); +#else + const auto shuffle0 = + vector_u8(0, 1, 17, 2, 3, 19, 4, 5, 21, 6, 7, 23, 8, 9, 25, 10); + const auto shuffle1 = vector_u8(11, 27, 12, 13, 29, 14, 15, 31, -1, -1, -1, + -1, -1, -1, -1, -1); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto utf8_0 = shuffle0.lookup_32(as_vector_u8(s4), as_vector_u8(t2)); + const auto utf8_1 = shuffle1.lookup_32(as_vector_u8(s4), as_vector_u8(t2)); + + utf8_0.store(utf8_output); + utf8_output += 16; + utf8_1.store(utf8_output); + utf8_output += 8; + return; + } + + const uint8_t mask0 = uint8_t(mask); + + const uint8_t *row0 = + &simdutf::tables::ppc64_utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const auto shuffle0 = vector_u8::load(row0 + 1); + + const auto utf8_0 = shuffle0.lookup_32(as_vector_u8(s4), as_vector_u8(t2)); + const uint8_t mask1 = static_cast(mask >> 8); + + const uint8_t *row1 = + &simdutf::tables::ppc64_utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const auto shuffle1 = vector_u8::load(row1 + 1) + uint8_t(8); + const auto utf8_1 = shuffle1.lookup_32(as_vector_u8(s4), as_vector_u8(t2)); + + utf8_0.store(utf8_output); + utf8_output += row0[0]; + utf8_1.store(utf8_output); + utf8_output += row1[0]; +} + +struct utf16_to_utf8_t { + error_code err; + const char16_t *input; + char *output; +}; + +/* + Returns utf16_to_utf8_t value + A scalar routine should carry on the conversion of the tail, + iff there was no error. +*/ +template +utf16_to_utf8_t ppc64_convert_utf16_to_utf8(const char16_t *buf, size_t len, + char *utf8_output) { + + const char16_t *end = buf + len; + + const auto v_f800 = vector_u16(0xf800); + const auto v_d800 = vector_u16(0xd800); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + auto in = vector_u16::load(buf); + if (not match_system(big_endian)) { + in = in.swap_bytes(); + } + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + if (in.is_ascii()) { + auto nextin = vector_u16::load(buf + vector_u16::ELEMENTS); + if (not match_system(big_endian)) { + nextin = nextin.swap_bytes(); + } + + if (nextin.is_ascii()) { + // 1. pack the bytes + const auto utf8_packed = vector_u16::pack(in, nextin); + // 2. store (16 bytes) + utf8_packed.store(utf8_output); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + + // next block is not ASCII + const auto utf8_packed = vector_u16::pack(in, in); + // 2. store (16 bytes) + utf8_packed.store(utf8_output); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + // fallback + } + + // no bits set above 7th bit + const auto one_byte_bytemask = in < uint16_t(1 << 7); + const uint16_t one_byte_bitmask = one_byte_bytemask.to_bitmask(); + + // no bits set above 11th bit + const auto one_or_two_bytes_bytemask = in < uint16_t(1 << 11); + const uint16_t one_or_two_bytes_bitmask = + one_or_two_bytes_bytemask.to_bitmask(); + + if (one_or_two_bytes_bitmask == 0xffff) { + write_v_u16_11bits_to_utf8( + in, utf8_output, as_vector_u8(one_byte_bytemask), one_byte_bitmask); + buf += 8; + continue; + } + + // 1. Check if there are any surrogate word in the input chunk. 
+ // We have also to deal with situation when there is a surrogate word + // at the end of a chunk. + const auto surrogates_bytemask = (in & v_f800) == v_d800; + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint16_t surrogates_bitmask = surrogates_bytemask.to_bitmask(); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x0000) { + ppc64_convert_utf16_to_1_2_3_bytes_of_utf8( + in, one_byte_bitmask, one_or_two_bytes_bytemask, + one_or_two_bytes_bitmask, utf8_output); + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { + *utf8_output++ = uint8_t(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = uint8_t((word >> 6) | 0b11000000); + *utf8_output++ = uint8_t((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = uint8_t((word >> 12) | 0b11100000); + *utf8_output++ = uint8_t(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = uint8_t((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return utf16_to_utf8_t{error_code::SURROGATE, buf + k - 1, + utf8_output}; + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = uint8_t((value >> 18) | 0b11110000); + *utf8_output++ = uint8_t(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = uint8_t(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = uint8_t((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return utf16_to_utf8_t{error_code::SUCCESS, buf, utf8_output}; +} +/* end file src/ppc64/ppc64_convert_utf16_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/ppc64/ppc64_convert_utf16_to_utf32.cpp */ +struct utf16_to_utf32_t { + error_code err; // error code + const char16_t *input; // last position in input buffer + char32_t *output; // last position in output buffer +}; + +template +utf16_to_utf32_t ppc64_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_output) { + const char16_t *end = buf + len; + + const auto v_f800 = vector_u16::splat(0xf800); + const auto v_d800 = vector_u16::splat(0xd800); + const auto zero = vector_u8::zero(); + + while (end - buf >= vector_u16::ELEMENTS) { + auto in = vector_u16::load(buf); + if (not match_system(big_endian)) { + in = in.swap_bytes(); + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const auto surrogates_bytemask = (in & v_f800) == v_d800; + + // bitmask = 0x0000 if there are no surrogates + const uint16_t surrogates_bitmask = surrogates_bytemask.to_bitmask(); + + // It might seem like checking for surrogates_bitmask == 0xc000 could help. 
+ // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x0000) { + // case: no surrogate pairs, extend 16-bit code units to 32-bit code units +#if SIMDUTF_IS_BIG_ENDIAN + const auto lo = + vector_u8(16, 16, 0, 1, 16, 16, 2, 3, 16, 16, 4, 5, 16, 16, 6, 7); + const auto hi = vector_u8(16, 16, 8 + 0, 8 + 1, 16, 16, 8 + 2, 8 + 3, 16, + 16, 8 + 4, 8 + 5, 16, 16, 8 + 6, 8 + 7); +#else + const auto lo = + vector_u8(0, 1, 16, 16, 2, 3, 16, 16, 4, 5, 16, 16, 6, 7, 16, 16); + const auto hi = vector_u8(8 + 0, 8 + 1, 16, 16, 8 + 2, 8 + 3, 16, 16, + 8 + 4, 8 + 5, 16, 16, 8 + 6, 8 + 7, 16, 16); +#endif // SIMDUTF_IS_BIG_ENDIAN + + const auto utf32_0 = lo.lookup_32(as_vector_u8(in), zero); + const auto utf32_1 = hi.lookup_32(as_vector_u8(in), zero); + + utf32_0.store(utf32_output); + utf32_1.store(utf32_output + 4); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + const uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return utf16_to_utf32_t{error_code::SURROGATE, buf + k - 1, + utf32_output}; + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + + return utf16_to_utf32_t{error_code::SUCCESS, buf, utf32_output}; +} +/* end file src/ppc64/ppc64_convert_utf16_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/ppc64/ppc64_convert_utf32_to_latin1.cpp */ +enum class ErrorChecking { disabled, enabled }; + +struct utf32_to_latin1_t { + error_code err; + const char32_t *input; + char *output; +}; + +template +utf32_to_latin1_t simdutf_really_inline ppc64_convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) { + constexpr size_t N = vector_u32::ELEMENTS; + const size_t rounded_len = align_down<4 * N>(len); + + const auto high_bytes_mask = vector_u32::splat(0xFFFFFF00); + + for (size_t i = 0; i < rounded_len; i += 4 * N) { + const auto in1 = vector_u32::load(buf + 0 * N); + const auto in2 = vector_u32::load(buf + 1 * N); + const auto in3 = vector_u32::load(buf + 2 * N); + const auto in4 = vector_u32::load(buf + 3 * N); + + if (ec == ErrorChecking::enabled) { + const auto combined = in1 | in2 | in3 | in4; + const auto too_big = (combined & high_bytes_mask) != uint32_t(0); + + if (simdutf_unlikely(too_big.any())) { + // Scalar code will carry on from the beginning of the current block + // and report the exact error position. 
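+// e.g. a scalar rescan of the failed block, of the kind the caller is
+// expected to perform (a sketch, not the library's actual scalar routine):
+#if 0 // illustration only, not part of the upstream sources
+  #include <cstddef>
+  #include <cstdint>
+size_t first_non_latin1_utf32(const char32_t *buf, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    if (uint32_t(buf[i]) > 0xFF) {
+      return i; // first code point that does not fit in one byte
+    }
+  }
+  return n;
+}
+#endif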
+ return utf32_to_latin1_t{error_code::OTHER, buf, latin1_output}; + } + } + + // Note: element #1 contains 0, and is used to mask-out elements +#if SIMDUTF_IS_BIG_ENDIAN + const auto shlo = vector_u8(0 + 3, 4 + 3, 8 + 3, 12 + 3, 16 + 3, 20 + 3, + 24 + 3, 28 + 3, 1, 1, 1, 1, 1, 1, 1, 1); + const auto shhi = vector_u8(1, 1, 1, 1, 1, 1, 1, 1, 0 + 3, 4 + 3, 8 + 3, + 12 + 3, 16 + 3, 20 + 3, 24 + 3, 28 + 3); +#else + const auto shlo = + vector_u8(0, 4, 8, 12, 16, 20, 24, 28, 1, 1, 1, 1, 1, 1, 1, 1); + const auto shhi = + vector_u8(1, 1, 1, 1, 1, 1, 1, 1, 0, 4, 8, 12, 16, 20, 24, 28); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto lo = shlo.lookup_32(as_vector_u8(in1), as_vector_u8(in2)); + const auto hi = shhi.lookup_32(as_vector_u8(in3), as_vector_u8(in4)); + + const auto merged = lo | hi; + + merged.store(latin1_output); + latin1_output += 4 * N; + buf += 4 * N; + } + + return utf32_to_latin1_t{error_code::SUCCESS, buf, latin1_output}; +} +/* end file src/ppc64/ppc64_convert_utf32_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF16 +/* begin file src/ppc64/ppc64_convert_utf32_to_utf16.cpp */ +struct utf32_to_utf16_t { + error_code err; + const char32_t *input; + char16_t *output; +}; + +template +utf32_to_utf16_t ppc64_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_output) { + + const char32_t *end = buf + len; + + const auto zero = vector_u32::zero(); + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + + auto forbidden_global = simd16(); + + while (end - buf >= 8) { + const auto in0 = vector_u32::load(buf); + const auto in1 = vector_u32::load(buf + vector_u32::ELEMENTS); + + const auto any_surrogate = ((in0 | in1) & v_ffff0000) != zero; + + // Check if no bits set above 15th + if (any_surrogate.is_zero()) { + // Pack UTF-32 to UTF-16 +#if SIMDUTF_IS_BIG_ENDIAN + const auto sh = big_endian ? vector_u8(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, + 22, 23, 26, 27, 30, 31) + : vector_u8(3, 2, 7, 6, 11, 10, 15, 14, 19, 18, + 23, 22, 27, 26, 31, 30); +#else + const auto sh = big_endian ? vector_u8(1, 0, 5, 4, 9, 8, 13, 12, 17, 16, + 21, 20, 25, 24, 29, 28) + : vector_u8(0, 1, 4, 5, 8, 9, 12, 13, 16, 17, + 20, 21, 24, 25, 28, 29); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto packed0 = sh.lookup_32(as_vector_u8(in0), as_vector_u8(in1)); + const auto packed = as_vector_u16(packed0); + +#if SIMDUTF_IS_BIG_ENDIAN + const auto v_f800 = + big_endian ? vector_u16::splat(0xf800) : vector_u16::splat(0x00f8); + const auto v_d800 = + big_endian ? vector_u16::splat(0xd800) : vector_u16::splat(0x00d8); +#else + const auto v_f800 = + big_endian ? vector_u16::splat(0x00f8) : vector_u16::splat(0xf800); + const auto v_d800 = + big_endian ? 
vector_u16::splat(0x00d8) : vector_u16::splat(0xd800); +#endif // SIMDUTF_IS_BIG_ENDIAN + const auto forbidden = (packed & v_f800) == v_d800; + + switch (er) { + case ErrorReporting::precise: + if (not forbidden.is_zero()) { + // scalar procedure will rescan the portion of buffer we've just + // analysed + return utf32_to_utf16_t{error_code::OTHER, buf, utf16_output}; + } + break; + case ErrorReporting::at_the_end: + forbidden_global |= forbidden; + break; + case ErrorReporting::none: + break; + } + + packed.store(utf16_output); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return utf32_to_utf16_t{error_code::SURROGATE, buf + k, + utf16_output}; + } + *utf16_output++ = + scalar::utf16::swap_if_needed(uint16_t(word)); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return utf32_to_utf16_t{error_code::TOO_LARGE, buf + k, + utf16_output}; + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + high_surrogate = + scalar::utf16::swap_if_needed(high_surrogate); + low_surrogate = + scalar::utf16::swap_if_needed(low_surrogate); + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + if (er == ErrorReporting::at_the_end) { + // check for invalid input + if (not forbidden_global.is_zero()) { + return utf32_to_utf16_t{error_code::SURROGATE, buf, utf16_output}; + } + } + + return utf32_to_utf16_t{error_code::SUCCESS, buf, utf16_output}; +} +/* end file src/ppc64/ppc64_convert_utf32_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF32 +/* begin file src/ppc64/ppc64_convert_utf32_to_utf8.cpp */ +struct utf32_to_utf8_t { + error_code err; + const char32_t *input; + char *output; +}; + +template +utf32_to_utf8_t ppc64_convert_utf32_to_utf8(const char32_t *buf, size_t len, + char *utf8_output) { + const char32_t *end = buf + len; + + const auto v_f800 = vector_u16::splat(0xf800); + const auto v_d800 = vector_u16::splat(0xd800); + + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto v_00000000 = vector_u32::zero(); + auto forbidden_bytemask = simd16(); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= + std::ptrdiff_t( + 16 + safety_margin)) { // buf is a char32_t pointer, each char32_t + // has 4 bytes or 32 bits, thus buf + 16 * + // char_32t = 512 bits = 64 bytes + // We load two 16 bytes registers for a total of 32 bytes or 16 characters. + // These two values can hold only 8 UTF32 chars + auto in0 = vector_u32::load(buf); + auto in1 = vector_u32::load(buf + vector_u32::ELEMENTS); + + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + auto in = vector_u32::pack(in0, in1); + + // Try to apply UTF-16 => UTF-8 from ./ppc64_convert_utf16_to_utf8.cpp + + // Check for ASCII fast path + + // ASCII fast path!!!! + // We eagerly load another 32 bytes, hoping that they will be ASCII too. + // The intuition is that we try to collect 16 ASCII characters which + // requires a total of 64 bytes of input. 
+    // If we fail, we just pass in2 and in3 (the two vectors loaded below)
+    // as our new inputs.
+    if (in.is_ascii()) { // if the first two blocks are ASCII
+      const auto in2 = vector_u32::load(buf + 2 * vector_u32::ELEMENTS);
+      const auto in3 = vector_u32::load(buf + 3 * vector_u32::ELEMENTS);
+
+      const auto next = vector_u32::pack(in2, in3);
+      if (next.is_ascii()) {
+        // 1. pack the bytes
+        const auto utf8_packed = vector_u16::pack(in, next);
+        // 2. store (16 bytes)
+        utf8_packed.store(utf8_output);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      }
+
+      // `next` is not ASCII, write `in` and carry on with next
+
+      // 1. pack the bytes
+      const auto utf8_packed = vector_u16::pack(in, in);
+      // 2. store (only the first 8 bytes are kept)
+      utf8_packed.store(utf8_output);
+      // 3. adjust pointers
+      buf += 8;
+      utf8_output += 8;
+
+      // Proceed with next input
+      in = next;
+      in0 = in2;
+      in1 = in3;
+    }
+
+    // no bits set above 7th bit
+    const auto one_byte_bytemask = in < uint16_t(1 << 7);
+    const uint16_t one_byte_bitmask = one_byte_bytemask.to_bitmask();
+
+    // no bits set above 11th bit
+    const auto one_or_two_bytes_bytemask = in < uint16_t(1 << 11);
+    const uint16_t one_or_two_bytes_bitmask =
+        one_or_two_bytes_bytemask.to_bitmask();
+
+    if (one_or_two_bytes_bitmask == 0xffff) {
+      write_v_u16_11bits_to_utf8(
+          in, utf8_output, as_vector_u8(one_byte_bytemask), one_byte_bitmask);
+      buf += 8;
+      continue;
+    }
+
+    // Check for overflow in packing
+    const auto saturation_bytemask = ((in0 | in1) & v_ffff0000) == v_00000000;
+    const uint16_t saturation_bitmask = saturation_bytemask.to_bitmask();
+    if (saturation_bitmask == 0xffff) {
+      switch (er) {
+      case ErrorReporting::precise: {
+        const auto forbidden = (in & v_f800) == v_d800;
+        if (forbidden.any()) {
+          // We return no error code, instead we force the scalar procedure
+          // to rescan the portion of input where we've just found an error.
+          return utf32_to_utf8_t{error_code::SUCCESS, buf, utf8_output};
+        }
+      } break;
+      case ErrorReporting::at_the_end:
+        forbidden_bytemask |= (in & v_f800) == v_d800;
+        break;
+      case ErrorReporting::none:
+        break;
+      }
+
+      ppc64_convert_utf16_to_1_2_3_bytes_of_utf8(
+          in, one_byte_bitmask, one_or_two_bytes_bytemask,
+          one_or_two_bytes_bitmask, utf8_output);
+      buf += 8;
+    } else {
+      // case: at least one 32-bit word produces a surrogate pair in UTF-16 <=>
+      // will produce four UTF-8 bytes. Let us do a scalar fallback. It may
+      // seem wasteful to use scalar code, but being efficient with SIMD in the
+      // presence of surrogate pairs may require non-trivial tables.
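+      // Editor's illustration (not part of the upstream simdutf source):
+      // the 4-byte branch of the scalar fallback below maps a supplementary
+      // code point such as U+1F600 to the UTF-8 sequence F0 9F 98 80:
+      static_assert(((0x1F600 >> 18) | 0xF0) == 0xF0, "lead byte");
+      static_assert((((0x1F600 >> 12) & 0x3F) | 0x80) == 0x9F, "cont. byte 1");
+      static_assert((((0x1F600 >> 6) & 0x3F) | 0x80) == 0x98, "cont. byte 2");
+      static_assert(((0x1F600 & 0x3F) | 0x80) == 0x80, "cont. byte 3");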
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFFFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xFFFFF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xFFFF0000) == 0) {
+          if (er != ErrorReporting::none and
+              (word >= 0xD800 && word <= 0xDFFF)) {
+            return utf32_to_utf8_t{error_code::SURROGATE, buf + k, utf8_output};
+          }
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          if (er != ErrorReporting::none and (word > 0x10FFFF)) {
+            return utf32_to_utf8_t{error_code::TOO_LARGE, buf + k, utf8_output};
+          }
+          *utf8_output++ = char((word >> 18) | 0b11110000);
+          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  if (er == ErrorReporting::at_the_end) {
+    if (forbidden_bytemask.any()) {
+      return utf32_to_utf8_t{error_code::SURROGATE, buf, utf8_output};
+    }
+  }
+
+  return utf32_to_utf8_t{
+      error_code::SUCCESS,
+      buf,
+      utf8_output,
+  };
+}
+/* end file src/ppc64/ppc64_convert_utf32_to_utf8.cpp */
+#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF8
+
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+/* begin file src/ppc64/ppc64_utf8_length_from_latin1.cpp */
+template <typename T> T min(T a, T b) { return a <= b ? a : b; }
+
+std::pair<const char *, size_t>
+ppc64_utf8_length_from_latin1(const char *input, size_t length) {
+  constexpr size_t N = vector_u8::ELEMENTS;
+  length = (length / N);
+
+  size_t count = length * N;
+  while (length != 0) {
+    vector_u32 partial = vector_u32::zero();
+
+    // partial accumulator has 32 bits => this yields ((2^32 - 1) / N)
+    size_t chunk = min(length, size_t(0xffffffff / N));
+    length -= chunk;
+    while (chunk != 0) {
+      auto local = vector_u8::zero();
+      // local accumulator has 8 bits => this yields 255 max (we increment by 1
+      // in each iteration)
+      const size_t n = min(chunk, size_t(255));
+      chunk -= n;
+      for (size_t i = 0; i < n; i++) {
+        const auto in = vector_i8::load(input);
+        input += N;
+
+        local -= as_vector_u8(in < vector_i8::splat(0));
+      }
+
+      partial = sum4bytes(local, partial);
+    }
+
+    for (int i = 0; i < vector_u32::ELEMENTS; i++) {
+      count += size_t(partial.value[i]);
+    }
+  }
+
+  return std::make_pair(input, count);
+}
+/* end file src/ppc64/ppc64_utf8_length_from_latin1.cpp */
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+
+#if SIMDUTF_FEATURE_BASE64
+/* begin file src/ppc64/ppc64_base64.cpp */
+/*
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + * + * AMD XOP specific: http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html + * Altivec has capabilities of AMD XOP (or vice versa): shuffle using 2 vectors + * and variable shifts, thus this implementation shares some code solution + * (modulo intrinsic function names). + */ + +constexpr bool with_base64_std = false; +constexpr bool with_base64_url = true; +constexpr bool with_ignore_errors = true; +constexpr bool with_ignore_garbage = true; +constexpr bool with_strict_checking = false; + +// --- encoding ----------------------------------------------- + +/* + Procedure translates vector of bytes having 6-bit values + into ASCII counterparts. +*/ +template +vector_u8 encoding_translate_6bit_values(const vector_u8 input) { + // credit: Wojciech Muła + // reduce 0..51 -> 0 + // 52..61 -> 1 .. 10 + // 62 -> 11 + // 63 -> 12 + auto result = input.saturating_sub(vector_u8::splat(51)); + + // distinguish between ranges 0..25 and 26..51: + // 0 .. 25 -> remains 13 + // 26 .. 51 -> becomes 0 + const auto lt = input < vector_u8::splat(26); + result = select(as_vector_u8(lt), vector_u8::splat(13), result); + + const auto shift_LUT = + base64_url ? vector_u8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0) + : vector_u8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); + // read shift + result = result.lookup_16(shift_LUT); + + return input + result; +} + +/* + Procedure expands 12 bytes (4*3 bytes) into 16 bytes, + each byte stores 6 bits of data +*/ +template +simdutf_really_inline vector_u8 encoding_expand_6bit_fields(vector_u8 input) { +#if SIMDUTF_IS_BIG_ENDIAN + #define indices4(dx) (dx + 0), (dx + 1), (dx + 1), (dx + 2) + const auto expand_3_to_4 = vector_u8(indices4(0 * 3), indices4(1 * 3), + indices4(2 * 3), indices4(3 * 3)); + #undef indices4 + + // input = [........|ccdddddd|bbbbcccc|aaaaaabb] as uint8_t + // 3 2 1 0 + // + // in' = [aaaaaabb|bbbbcccc|bbbbcccc|ccdddddd] as uint32_t + // 0 1 1 2 + const auto in = as_vector_u32(expand_3_to_4.lookup_16(input)); + + // t0 = [00000000|00000000|00000000|00dddddd] + const auto t0 = in & uint32_t(0x0000003f); + + // t1 = [00000000|00000000|00cccccc|00dddddd] + const auto t1 = select(uint32_t(0x00003f00), in.shl<2>(), t0); + + // t2 = [00000000|00bbbbbb|00cccccc|00dddddd] + const auto t2 = select(uint32_t(0x003f0000), in.shr<4>(), t1); + + // t3 = [00aaaaaa|00bbbbbb|00cccccc|00dddddd] + const auto t3 = select(uint32_t(0x3f000000), in.shr<2>(), t2); + + return as_vector_u8(t3); +#else + #define indices4(dx) (dx + 1), (dx + 0), (dx + 2), (dx + 1) + const auto expand_3_to_4 = vector_u8(indices4(0 * 3), indices4(1 * 3), + indices4(2 * 3), indices4(3 * 3)); + #undef indices4 + + // input = [........|ccdddddd|bbbbcccc|aaaaaabb] as uint8_t + // 3 2 1 0 + // + // in' = [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] as uint32_t + // 1 2 0 1 + const auto in = 
as_vector_u32(expand_3_to_4.lookup_16(input)); + + // t0 = [00dddddd|00000000|00000000|00000000] + const auto t0 = in.shl<8>() & uint32_t(0x3f000000); + + // t1 = [00dddddd|00cccccc|00000000|00000000] + const auto t1 = select(uint32_t(0x003f0000), in.shr<6>(), t0); + + // t2 = [00dddddd|00cccccc|00bbbbbb|00000000] + const auto t2 = select(uint32_t(0x00003f00), in.shl<4>(), t1); + + // t3 = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] + const auto t3 = select(uint32_t(0x0000003f), in.shr<10>(), t2); + + return as_vector_u8(t3); +#endif // SIMDUTF_IS_BIG_ENDIAN +} + +template +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + + const uint8_t *input = (const uint8_t *)src; + + uint8_t *out = (uint8_t *)dst; + + size_t i = 0; + for (; i + 52 <= srclen; i += 48) { + const auto in0 = vector_u8::load(input + i + 12 * 0); + const auto in1 = vector_u8::load(input + i + 12 * 1); + const auto in2 = vector_u8::load(input + i + 12 * 2); + const auto in3 = vector_u8::load(input + i + 12 * 3); + + const auto expanded0 = encoding_expand_6bit_fields(in0); + const auto expanded1 = encoding_expand_6bit_fields(in1); + const auto expanded2 = encoding_expand_6bit_fields(in2); + const auto expanded3 = encoding_expand_6bit_fields(in3); + + const auto base64_0 = + encoding_translate_6bit_values(expanded0); + const auto base64_1 = + encoding_translate_6bit_values(expanded1); + const auto base64_2 = + encoding_translate_6bit_values(expanded2); + const auto base64_3 = + encoding_translate_6bit_values(expanded3); + + base64_0.store(out); + out += 16; + + base64_1.store(out); + out += 16; + + base64_2.store(out); + out += 16; + + base64_3.store(out); + out += 16; + } + for (; i + 16 <= srclen; i += 12) { + const auto in = vector_u8::load(input + i); + const auto expanded = encoding_expand_6bit_fields(in); + const auto base64 = encoding_translate_6bit_values(expanded); + + base64.store(out); + out += 16; + } + + return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, + srclen - i, options); +} + +// --- decoding ----------------------------------------------- + +static simdutf_really_inline void compress(const vector_u8 data, uint16_t mask, + char *output) { + if (mask == 0) { + data.store(output); + return; + } + + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + +#if SIMDUTF_IS_BIG_ENDIAN + vec_u64_t tmp = { + tables::base64::thintable_epi8[mask2], + tables::base64::thintable_epi8[mask1], + }; + + auto shufmask = vector_u8(vec_reve(vec_u8_t(tmp))); + + // we increment by 0x08 the second half of the mask + shufmask = + shufmask + vector_u8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8); +#else + vec_u64_t tmp = { + tables::base64::thintable_epi8[mask1], + tables::base64::thintable_epi8[mask2], + }; + + auto shufmask = vector_u8(vec_u8_t(tmp)); + + // we increment by 0x08 the second half of the mask + shufmask = + shufmask + vector_u8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8); +#endif // SIMDUTF_IS_BIG_ENDIAN + + // this is the version "nearly pruned" + const auto pruned = shufmask.lookup_16(data); + // we still need to put the two halves together. 
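+  // Editor's illustration (not part of the upstream simdutf source): with
+  // mask == 0x0104, byte #2 of the low half and byte #8 (the first byte of
+  // the high half) are to be removed. The thintable shuffle above compacts
+  // each half independently, leaving 7 surviving bytes per half; the
+  // popcount/pshufb_combine_table step below then shifts the second half
+  // down so that all 14 surviving bytes end up contiguous in the output.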
+ // we compute the popcount of the first half: + const int pop1 = tables::base64::BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + const auto compactmask = + vector_u8::load(tables::base64::pshufb_combine_table + pop1 * 8); + + const auto answer = compactmask.lookup_16(pruned); + + answer.store(output); +} + +static simdutf_really_inline vector_u8 decoding_pack(vector_u8 input) { +#if SIMDUTF_IS_BIG_ENDIAN + // in = [00aaaaaa|00bbbbbb|00cccccc|00dddddd] + // want = [00000000|aaaaaabb|bbbbcccc|ccdddddd] + + auto in = as_vector_u16(input); + // t0 = [00??aaaa|aabbbbbb|00??cccc|ccdddddd] + const auto t0 = in.shr<2>(); + const auto t1 = select(uint16_t(0x0fc0), t0, in); + + // t0 = [00??????|aaaaaabb|bbbbcccc|ccdddddd] + const auto t2 = as_vector_u32(t1); + const auto t3 = t2.shr<4>(); + const auto t4 = select(uint32_t(0x00fff000), t3, t2); + + const auto tmp = as_vector_u8(t4); + + const auto shuffle = + vector_u8(1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 0, 0, 0, 0); + + const auto t = shuffle.lookup_16(tmp); + + return t; +#else + // in = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] + // want = [00000000|aaaaaabb|bbbbcccc|ccdddddd] + + auto u = as_vector_u32(input).swap_bytes(); + + auto in = vector_u16((vec_u16_t)u.value); + // t0 = [00??aaaa|aabbbbbb|00??cccc|ccdddddd] + const auto t0 = in.shr<2>(); + const auto t1 = select(uint16_t(0x0fc0), t0, in); + + // t0 = [00??????|aaaaaabb|bbbbcccc|ccdddddd] + const auto t2 = as_vector_u32(t1); + const auto t3 = t2.shr<4>(); + const auto t4 = select(uint32_t(0x00fff000), t3, t2); + + const auto tmp = as_vector_u8(t4); + + const auto shuffle = + vector_u8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0, 0, 0, 0); + + const auto t = shuffle.lookup_16(tmp); + + return t; +#endif // SIMDUTF_IS_BIG_ENDIAN +} +static simdutf_really_inline void base64_decode(char *out, vector_u8 input) { + const auto expanded = decoding_pack(input); + expanded.store(out); +} + +static simdutf_really_inline void base64_decode_block(char *out, + const char *src) { + base64_decode(out + 12 * 0, vector_u8::load(src + 0 * 16)); + base64_decode(out + 12 * 1, vector_u8::load(src + 1 * 16)); + base64_decode(out + 12 * 2, vector_u8::load(src + 2 * 16)); + base64_decode(out + 12 * 3, vector_u8::load(src + 3 * 16)); +} + +static simdutf_really_inline void base64_decode_block_safe(char *out, + const char *src) { + base64_decode(out + 12 * 0, vector_u8::load(src + 0 * 16)); + base64_decode(out + 12 * 1, vector_u8::load(src + 1 * 16)); + base64_decode(out + 12 * 2, vector_u8::load(src + 2 * 16)); + + char buffer[16]; + base64_decode(buffer, vector_u8::load(src + 3 * 16)); + std::memcpy(out + 36, buffer, 12); +} + +// ---base64 decoding::block64 class -------------------------- + +class block64 { + simd8x64 b; + +public: + simdutf_really_inline block64(const char *src) : b(load_block(src)) {} + simdutf_really_inline block64(const char16_t *src) : b(load_block(src)) {} + +private: + // The caller of this function is responsible to ensure that there are 64 + // bytes available from reading at src. The data is read into a block64 + // structure. 
+ static simdutf_really_inline simd8x64 load_block(const char *src) { + const auto v0 = vector_u8::load(src + 16 * 0); + const auto v1 = vector_u8::load(src + 16 * 1); + const auto v2 = vector_u8::load(src + 16 * 2); + const auto v3 = vector_u8::load(src + 16 * 3); + + return simd8x64(v0, v1, v2, v3); + } + + // The caller of this function is responsible to ensure that there are 128 + // bytes available from reading at src. The data is read into a block64 + // structure. + static simdutf_really_inline simd8x64 + load_block(const char16_t *src) { + const auto m1 = vector_u16::load(src + 8 * 0); + const auto m2 = vector_u16::load(src + 8 * 1); + const auto m3 = vector_u16::load(src + 8 * 2); + const auto m4 = vector_u16::load(src + 8 * 3); + const auto m5 = vector_u16::load(src + 8 * 4); + const auto m6 = vector_u16::load(src + 8 * 5); + const auto m7 = vector_u16::load(src + 8 * 6); + const auto m8 = vector_u16::load(src + 8 * 7); + + return simd8x64(vector_u16::pack(m1, m2), vector_u16::pack(m3, m4), + vector_u16::pack(m5, m6), + vector_u16::pack(m7, m8)); + } + +public: + template + static inline uint16_t to_base64_mask(vector_u8 &src, uint16_t &error) { + const auto ascii_space_tbl = + vector_u8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, + 0xc, 0xd, 0x0, 0x0); + + // credit: aqrit + const auto delta_asso = + default_or_url + ? vector_u8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16) + : vector_u8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); + + const auto delta_values = + default_or_url + ? vector_u8(0xBF, 0xE0, 0xB9, 0x13, 0x04, 0xBF, 0xBF, 0xB9, 0xB9, + 0x00, 0xFF, 0x11, 0xFF, 0xBF, 0x10, 0xB9) + : (base64_url + ? vector_u8(0x0, 0x0, 0x0, 0x13, 0x4, 0xBF, 0xBF, 0xB9, 0xB9, + 0x0, 0x11, 0xC3, 0xBF, 0xE0, 0xB9, 0xB9) + : vector_u8(0x00, 0x00, 0x00, 0x13, 0x04, 0xBF, 0xBF, 0xB9, + 0xB9, 0x00, 0x10, 0xC3, 0xBF, 0xBF, 0xB9, 0xB9)); + + const auto check_asso = + default_or_url + ? vector_u8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06) + : (base64_url + ? vector_u8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, + 0x3, 0x7, 0xB, 0xE, 0xB, 0x6) + : vector_u8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F)); + + const auto check_values = + default_or_url + ? vector_u8(0x80, 0x80, 0x80, 0x80, 0xCF, 0xBF, 0xD5, 0xA6, 0xB5, + 0xA1, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80) + : (base64_url + ? 
vector_u8(0x80, 0x80, 0x80, 0x80, 0xCF, 0xBF, 0xB6, 0xA6, + 0xB5, 0xA1, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80) + : vector_u8(0x80, 0x80, 0x80, 0x80, 0xCF, 0xBF, 0xD5, 0xA6, + 0xB5, 0x86, 0xD1, 0x80, 0xB1, 0x80, 0x91, 0x80)); + + const auto shifted = src.shr<3>(); + + const auto delta_hash = avg(src.lookup_16(delta_asso), shifted); + const auto check_hash = avg(src.lookup_16(check_asso), shifted); + + const auto out = as_vector_i8(delta_hash.lookup_16(delta_values)) + .saturating_add(as_vector_i8(src)); + const auto chk = as_vector_i8(check_hash.lookup_16(check_values)) + .saturating_add(as_vector_i8(src)); + + const uint16_t mask = chk.to_bitmask(); + if (!ignore_garbage && mask) { + const auto ascii = src.lookup_16(ascii_space_tbl); + const auto ascii_space = (ascii == src); + error = (mask ^ ascii_space.to_bitmask()); + } + src = out; + + return mask; + } + + template + simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) { + uint16_t err0 = 0; + uint16_t err1 = 0; + uint16_t err2 = 0; + uint16_t err3 = 0; + uint64_t m0 = to_base64_mask( + b.chunks[0], err0); + uint64_t m1 = to_base64_mask( + b.chunks[1], err1); + uint64_t m2 = to_base64_mask( + b.chunks[2], err2); + uint64_t m3 = to_base64_mask( + b.chunks[3], err3); + + if (!ignore_garbage) { + *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) | + ((uint64_t)err3 << 48); + } + return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); + } + + simdutf_really_inline void copy_block(char *output) { + b.store(reinterpret_cast(output)); + } + + simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) { + uint64_t nmask = ~mask; + compress(b.chunks[0], uint16_t(mask), output); + compress(b.chunks[1], uint16_t(mask >> 16), + output + count_ones(nmask & 0xFFFF)); + compress(b.chunks[2], uint16_t(mask >> 32), + output + count_ones(nmask & 0xFFFFFFFF)); + compress(b.chunks[3], uint16_t(mask >> 48), + output + count_ones(nmask & 0xFFFFFFFFFFFFULL)); + return count_ones(nmask); + } + + simdutf_really_inline void base64_decode_block(char *out) { + base64_decode(out + 12 * 0, b.chunks[0]); + base64_decode(out + 12 * 1, b.chunks[1]); + base64_decode(out + 12 * 2, b.chunks[2]); + base64_decode(out + 12 * 3, b.chunks[3]); + } + + simdutf_really_inline void base64_decode_block_safe(char *out) { + base64_decode(out + 12 * 0, b.chunks[0]); + base64_decode(out + 12 * 1, b.chunks[1]); + base64_decode(out + 12 * 2, b.chunks[2]); + char buffer[16]; + base64_decode(buffer, b.chunks[3]); + std::memcpy(out + 12 * 3, buffer, 12); + } +}; +/* end file src/ppc64/ppc64_base64.cpp */ +#endif // SIMDUTF_FEATURE_BASE64 + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +#if SIMDUTF_FEATURE_UTF8 +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace ppc64 { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with +// spaces +template struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 + * (in which case this function fills the buffer with spaces and returns 0. In + * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder + * block with STEP_SIZE bytes and no spaces for padding. 
+ * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); + +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text_64(const uint8_t *text) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text(const simd8x64 &in) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + if (buf[i] < ' ') { + buf[i] = '_'; + } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char *format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i = 0; i < 64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline +buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) + : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, + idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { + return idx; +} + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t * +buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t +buf_block_reader::get_remainder(uint8_t *dst) const { + if (len == idx) { + return 0; + } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, + STEP_SIZE); // std::memset STEP_SIZE because it is more efficient + // to write out 8 or 16 bytes at once. 
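+  // Editor's illustration (not part of the upstream simdutf source): with
+  // len == 70 and STEP_SIZE == 64 the reader yields one full block; on the
+  // final call idx == 64, so the memcpy below copies the 6 remaining bytes
+  // over the spaces, the other 58 bytes of dst keep the 0x20 padding, and
+  // the function returns 6.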
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_validation { + +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
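+  // Editor's illustration (not part of the upstream simdutf source): for the
+  // invalid pair 0xED 0xA0 (a UTF-16 surrogate encoded in UTF-8), prev1 ==
+  // 0xED picks TOO_SHORT | OVERLONG_3 | SURROGATE from byte_1_high (high
+  // nibble 0xE) and, from byte_1_low below, an entry containing SURROGATE
+  // (low nibble 0xD); the second byte 0xA0 picks an entry containing
+  // SURROGATE from byte_2_high (high nibble 0xA). The three-way AND in the
+  // return statement therefore leaves exactly the SURROGATE bit set.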
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = {255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0b11110000u - 1, + 0b11100000u - 1, + 0b11000000u - 1}; + const simd8 max_value( + &max_array[sizeof(max_array) - sizeof(simd8)]); + return input.gt_bits(max_value); +} + +struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. 
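+  // Editor's illustration (not part of the upstream simdutf source): if a
+  // block ends with ... 0xF0 0x9F, the 0xF0 lead byte sits second from the
+  // end where max_array caps the value at 0b11100000 - 1 (0xDF), so
+  // is_incomplete() reports a pending multi-byte sequence; check_eof() below
+  // turns that into an error when no further input arrives.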
+ simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64 &input) { + if (simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = + is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; + } + } + + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char *input, size_t length) { + return generic_validate_utf8( + reinterpret_cast(input), length); +} + +/** + * Validates that the string is actual UTF-8 and stops on errors. 
+ */ +template +result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input + count), length - count); + res.count += count; + return res; + } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_utf8_with_errors(const char *input, size_t length) { + return generic_validate_utf8_with_errors( + reinterpret_cast(input), length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + template + simdutf_really_inline size_t convert(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
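+    // Editor's illustration (not part of the upstream simdutf source): for
+    // pure-ASCII input every byte is a leading byte, so the loop above stops
+    // after stepping back 8 bytes: margin == size - 8 and safety_margin == 9.
+    // The main loop below then requires pos + 64 + 9 <= size, which keeps the
+    // last few leading bytes out of the SIMD path for the scalar tail.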
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); + if (howmany == 0) { + return 0; + } + utf16_output += howmany; + } + return utf16_output - start; + } + + template + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. 
+ const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. 
+ result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + +template +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. + size_t pos = 0; + char16_t *start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the + // mask far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow + // path. Anything that is not a continuation mask is a 'leading byte', + // that is, the start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* + // of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16( + input + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). 
In practice we expect an + // 85% to 90% efficiency. + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); + return utf16_output - start; +} + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf32 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if (howmany == 0) { + return 0; + } + utf32_output += howmany; + } + return utf32_output - start; + } + + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. 
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64 input(reinterpret_cast(in + pos));
+ if (input.is_ascii()) {
+ input.store_ascii_as_utf32(utf32_output);
+ utf32_output += 64;
+ pos += 64;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio,
+ // it is not good enough.
+ static_assert(
+ (simd8x64::NUM_CHUNKS == 2) ||
+ (simd8x64::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ auto zero = simd8{uint8_t(0)};
+ if (simd8x64::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if (simd8x64::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+ if (errors() || (utf8_continuation_mask & 1)) {
+ result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, utf32_output);
+ res.count += pos;
+ return res;
+ }
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_utf32(
+ in + pos, utf8_end_of_code_point_mask, utf32_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
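+ // (Editorial worked example, not upstream code: for the bytes
+ // 'A' 0xE2 0x82 0xAC -- 'A' followed by U+20AC -- the low bits of the
+ // masks above are, listed from byte 0 upward:
+ // continuation_mask : 0 0 1 1 (0x82 and 0xAC are 0b10xxxxxx)
+ // leading_mask : 1 1 0 0
+ // end_of_code_point : 1 0 0 ? (bit i is set when byte i+1 starts a
+ // new code point; '?' depends on the
+ // next byte)
+ // and this is the mask convert_masked_utf8_to_utf32 consumes.)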
+ }
+ }
+ if (errors()) {
+ result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, utf32_output);
+ res.count += pos;
+ return res;
+ }
+ if (pos < size) {
+ result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, utf32_output);
+ if (res.error) { // In case of error, we want the error position
+ res.count += pos;
+ return res;
+ } else { // In case of success, we want the number of words written
+ utf32_output += res.count;
+ }
+ }
+ return result(error_code::SUCCESS, utf32_output - start);
+ }
+
+ simdutf_really_inline bool errors() const {
+ return this->error.any_bits_set_anywhere();
+ }
+
+}; // struct utf8_checker
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
+/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf8_to_utf32 {
+
+using namespace simd;
+
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+ char32_t *utf32_output) noexcept {
+ size_t pos = 0;
+ char32_t *start{utf32_output};
+ const size_t safety_margin = 16; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64 in(reinterpret_cast(input + pos));
+ if (in.is_ascii()) {
+ in.store_ascii_as_utf32(utf32_output);
+ utf32_output += 64;
+ pos += 64;
+ } else {
+ // -65 is 0b10111111 in two's complement, so it is the largest possible
+ // continuation byte
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ size_t max_starting_point = (pos + 64) - 12;
+ while (pos < max_starting_point) {
+ size_t consumed = convert_masked_utf8_to_utf32(
+ input + pos, utf8_end_of_code_point_mask, utf32_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ }
+ }
+ utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
+ utf32_output);
+ return utf32_output - start;
+}
+
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+
+#if SIMDUTF_FEATURE_UTF8
+/* begin file src/generic/utf8.h */
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf8 {
+
+using namespace simd;
+
+simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
+ size_t pos = 0;
+ size_t count = 0;
+ for (; pos + 64 <= size; pos += 64) {
+ simd8x64 input(reinterpret_cast(in + pos));
+ uint64_t utf8_continuation_mask = input.gt(-65);
+ count += count_ones(utf8_continuation_mask);
+ }
+ return count + scalar::utf8::count_code_points(in + pos, size - pos);
+}
+
+#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
+simdutf_really_inline size_t count_code_points_bytemask(const char *in,
+ size_t size) {
+ using vector_i8 = simd8;
+ using vector_u8 = simd8;
+ using vector_u64 = simd64;
+
+ constexpr size_t N = vector_i8::SIZE;
+ constexpr size_t max_iterations = 255 / 4;
+
+ size_t pos = 0;
+ size_t count = 0;
+
+ auto counters = vector_u64::zero();
+ auto local = vector_u8::zero();
+ size_t iterations = 0;
+ for (; pos + 4 * N <= size; pos += 4 * N) {
+ const auto input0 =
+ simd8::load(reinterpret_cast(in + pos + 0 * N));
+ const auto input1 =
+ simd8::load(reinterpret_cast(in + pos + 1 * N));
+ const auto input2 =
+
simd8::load(reinterpret_cast(in + pos + 2 * N)); + const auto input3 = + simd8::load(reinterpret_cast(in + pos + 3 * N)); + const auto mask0 = input0 > int8_t(-65); + const auto mask1 = input1 > int8_t(-65); + const auto mask2 = input2 > int8_t(-65); + const auto mask3 = input3 > int8_t(-65); + + local -= vector_u8(mask0); + local -= vector_u8(mask1); + local -= vector_u8(mask2); + local -= vector_u8(mask3); + + iterations += 1; + if (iterations == max_iterations) { + counters += sum_8bytes(local); + local = vector_u8::zero(); + iterations = 0; + } + } + + if (iterations > 0) { + count += local.sum_bytes(); + } + + count += counters.sum(); + + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} +#endif // SIMDUTF_SIMD_HAS_BYTEMASK + +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + +} // namespace utf8 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8.h */ +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + + scalar::utf16::count_code_points(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
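+ // (Editorial sketch, not upstream code: the loop below charges each UTF-16
+ // code unit its UTF-8 byte cost, i.e. per code unit u
+ //
+ // u <= 0x7F -> 1 byte (ASCII)
+ // u <= 0x7FF -> 2 bytes
+ // 0xD800 <= u <= 0xDFFF -> 2 bytes (a surrogate pair totals 4)
+ // otherwise -> 3 bytes
+ //
+ // and the mask popcounts recover exactly these per-category counts.)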
+ for (; pos < size / 32 * 32; pos += 32) {
+ simd16x32 input(reinterpret_cast(in + pos));
+ if simdutf_constexpr (!match_system(big_endian)) {
+ input.swap_bytes();
+ }
+ uint64_t ascii_mask = input.lteq(0x7F);
+ uint64_t twobyte_mask = input.lteq(0x7FF);
+ uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+ size_t ascii_count = count_ones(ascii_mask) / 2;
+ size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+ size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+ size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+ count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
+ ascii_count;
+ }
+ return count + scalar::utf16::utf8_length_from_utf16(in + pos,
+ size - pos);
+}
+
+template
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
+ size_t size) {
+ return count_code_points(in, size);
+}
+
+simdutf_really_inline void
+change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
+ size_t pos = 0;
+
+ while (pos < size / 32 * 32) {
+ simd16x32 input(reinterpret_cast(in + pos));
+ input.swap_bytes();
+ input.store(reinterpret_cast(output));
+ pos += 32;
+ output += 32;
+ }
+
+ scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+}
+
+} // namespace utf16
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf16.h */
+/* begin file src/generic/validate_utf16.h */
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf16 {
+/*
+ UTF-16 validation
+ --------------------------------------------------
+
+ In UTF-16, code units in the range 0xD800 to 0xDFFF have special meaning.
+
+ In a vectorized algorithm we want to examine the most significant
+ nibble in order to select a fast path. If none of the highest nibbles
+ is 0xD (13), then we are sure that the UTF-16 chunk in a vector
+ register is valid.
+
+ Let us analyze what we need to check if the nibble is 0xD. The
+ value of the preceding nibble determines what we have:
+
+ 0xd000 .. 0xd7ff - a valid word
+ 0xd800 .. 0xdbff - low surrogate
+ 0xdc00 .. 0xdfff - high surrogate
+
+ Other constraints we have to consider:
+ - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+ - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+ - there must not be a sole low surrogate nor a sole high surrogate
+
+ We are going to build three bitmasks based on the 3rd nibble:
+ - V = valid word,
+ - L = low surrogate (0xd800 .. 0xdbff)
+ - H = high surrogate (0xdc00 .. 0xdfff)
+
+ 0 1 2 3 4 5 6 7 <--- word index
+ [ V | L | H | L | H | V | V | L ]
+ 1 0 0 0 0 1 1 0 - V = valid masks
+ 0 1 0 1 0 0 0 1 - L = low surrogate
+ 0 0 1 0 1 0 0 0 - H = high surrogate
+
+ 1 0 0 0 0 1 1 0 V = valid masks
+ 0 1 0 1 0 0 0 0 a = L & (H >> 1)
+ 0 0 1 0 1 0 0 0 b = a << 1
+ 1 1 1 1 1 1 1 0 c = V | a | b
+ ^
+ the last bit can be zero, we just consume 7
+ code units and recheck this word in the next iteration
+*/
+template
+const result validate_utf16_with_errors(const char16_t *input, size_t size) {
+ if (simdutf_unlikely(size == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+
+ const char16_t *start = input;
+ const char16_t *end = input + size;
+
+ const auto v_d8 = simd8::splat(0xd8);
+ const auto v_f8 = simd8::splat(0xf8);
+ const auto v_fc = simd8::splat(0xfc);
+ const auto v_dc = simd8::splat(0xdc);
+
+ while (input + simd16::SIZE * 2 < end) {
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors into one which
+ // consists only of the higher bytes.
+ auto in0 = simd16(input);
+ auto in1 =
+ simd16(input + simd16::SIZE / sizeof(char16_t));
+
+ // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16
+ // and yields a single vector having only higher bytes of characters.
+ const auto in = utf16_gather_high_bytes(in0, in1);
+
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint16_t surrogates_bitmask =
+ static_cast(surrogates_wordmask.to_bitmask());
+ if (surrogates_bitmask == 0x0000) {
+ input += 16;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ //
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint16_t V = static_cast(~surrogates_bitmask);
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint16_t H = static_cast(vH.to_bitmask());
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint16_t L = static_cast(~H & surrogates_bitmask);
+
+ const uint16_t a = static_cast(
+ L & (H >> 1)); // A low surrogate must be followed by a high one.
+ // (A low surrogate placed in the last word of the
+ // register is an exception we handle.)
+ const uint16_t b = static_cast(
+ a << 1); // Just mark that the opposite fact holds;
+ // thanks to that we have only two masks for the valid case.
+ const uint16_t c = static_cast(
+ V | a | b); // Combine all the masks into the final one.
+
+ if (c == 0xffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += 16;
+ } else if (c == 0x7fff) {
+ // The 15 lower code units of the input register contain valid UTF-16.
+ // The 15th word may be either a low or a high surrogate. In the next
+ // iteration we 1) check if the low surrogate is followed by a high
+ // one, 2) reject a sole high surrogate.
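+ // (Editorial worked example, not from upstream: if units 0 and 1 form a
+ // surrogate pair and everything else is valid, then V = 0xfffc,
+ // L = 0x0001, H = 0x0002, so a = L & (H >> 1) = 0x0001,
+ // b = a << 1 = 0x0002, and c = V | a | b = 0xffff: all 16 units are
+ // consumed. If instead a lone low surrogate sits in unit 15, then
+ // V = 0x7fff, L = 0x8000, H = 0, hence a = b = 0 and c = 0x7fff,
+ // which lands in this branch:)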
+ input += 15;
+ } else {
+ return result(error_code::SURROGATE, input - start);
+ }
+ }
+ }
+
+ return result(error_code::SUCCESS, input - start);
+}
+
+template
+const result validate_utf16_as_ascii_with_errors(const char16_t *input,
+ size_t size) {
+ if (simdutf_unlikely(size == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ size_t pos = 0;
+ for (; pos < size / 32 * 32; pos += 32) {
+ simd16x32 input_vec(
+ reinterpret_cast(input + pos));
+ if simdutf_constexpr (!match_system(big_endian)) {
+ input_vec.swap_bytes();
+ }
+ uint64_t matches = input_vec.lteq(uint16_t(0x7f));
+ if (~matches) {
+ // Found a code unit above 0x7F, return the first one
+ int index = trailing_zeroes(~matches) / 2;
+ return result(error_code::TOO_LARGE, pos + index);
+ }
+ }
+
+ // Scalar tail
+ while (pos < size) {
+
+ char16_t v = scalar::utf16::swap_if_needed(input[pos]);
+ if (v > 0x7F) {
+ return result(error_code::TOO_LARGE, pos);
+ }
+ pos++;
+ }
+ return result(error_code::SUCCESS, size);
+}
+
+} // namespace utf16
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/validate_utf16.h */
+#endif // SIMDUTF_FEATURE_UTF16
+
+#if SIMDUTF_FEATURE_UTF32
+/* begin file src/generic/utf32.h */
+#include
+
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf32 {
+
+template T min(T a, T b) { return a <= b ? a : b; }
+
+simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
+ size_t length) {
+ using vector_u32 = simd32;
+
+ const char32_t *start = input;
+
+ // we add up to three ones in a single iteration (see the vectorized loop in
+ // section #2 below)
+ const size_t max_increment = 3;
+
+ const size_t N = vector_u32::ELEMENTS;
+
+#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
+ const auto v_0000007f = vector_u32::splat(0x0000007f);
+ const auto v_000007ff = vector_u32::splat(0x000007ff);
+ const auto v_0000ffff = vector_u32::splat(0x0000ffff);
+#else
+ const auto v_ffffff80 = vector_u32::splat(0xffffff80);
+ const auto v_fffff800 = vector_u32::splat(0xfffff800);
+ const auto v_ffff0000 = vector_u32::splat(0xffff0000);
+ const auto one = vector_u32::splat(1);
+#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP
+
+ size_t counter = 0;
+
+ // 1.
vectorized loop unrolled 4 times + { + // we use vector of uint32 counters, this is why this limit is used + const size_t max_iterations = + std::numeric_limits::max() / (max_increment * 4); + size_t blocks = length / (N * 4); + length -= blocks * (N * 4); + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + simd32 acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in0 = vector_u32(input + 0 * N); + const auto in1 = vector_u32(input + 1 * N); + const auto in2 = vector_u32(input + 2 * N); + const auto in3 = vector_u32(input + 3 * N); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else + acc += min(one, in0 & v_ffffff80); + acc += min(one, in1 & v_ffffff80); + acc += min(one, in2 & v_ffffff80); + acc += min(one, in3 & v_ffffff80); + + acc += min(one, in0 & v_fffff800); + acc += min(one, in1 & v_fffff800); + acc += min(one, in2 & v_fffff800); + acc += min(one, in3 & v_fffff800); + + acc += min(one, in0 & v_ffff0000); + acc += min(one, in1 & v_ffff0000); + acc += min(one, in2 & v_ffff0000); + acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += 4 * N; + } + + counter += acc.sum(); + } + } + + // 2. vectorized loop for tail + { + const size_t max_iterations = + std::numeric_limits::max() / max_increment; + size_t blocks = length / N; + length -= blocks * N; + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + auto acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in = vector_u32(input); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else + acc += min(one, in & v_ffffff80); + acc += min(one, in & v_fffff800); + acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += N; + } + + counter += acc.sum(); + } + } + + const size_t consumed = input - start; + if (consumed != 0) { + // We don't count 0th bytes in the vectorized loops above, this + // is why we need to count them in the end. + counter += consumed; + } + + return counter + scalar::utf32::utf8_length_from_utf32(input, length); +} + +} // namespace utf32 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf32.h */ +/* begin file src/generic/validate_utf32.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf32 { + +simdutf_really_inline bool validate(const char32_t *input, size_t size) { + if (simdutf_unlikely(size == 0)) { + // empty input is valid UTF-32. 
protect the implementation from + // handling nullptr + return true; + } + + const char32_t *end = input + size; + + using vector_u32 = simd32; + + const auto standardmax = vector_u32::splat(0x10ffff); + const auto offset = vector_u32::splat(0xffff2000); + const auto standardoffsetmax = vector_u32::splat(0xfffff7ff); + auto currentmax = vector_u32::zero(); + auto currentoffsetmax = vector_u32::zero(); + + constexpr size_t N = vector_u32::ELEMENTS; + + while (input + N < end) { + auto in = vector_u32(input); + if simdutf_constexpr (!match_system(endianness::BIG)) { + in.swap_bytes(); + } + + currentmax = max(currentmax, in); + currentoffsetmax = max(currentoffsetmax, in + offset); + input += N; + } + + const auto too_large = currentmax > standardmax; + if (too_large.any()) { + return false; + } + + const auto surrogate = currentoffsetmax > standardoffsetmax; + if (surrogate.any()) { + return false; + } + + return scalar::utf32::validate(input, end - input); +} + +simdutf_really_inline result validate_with_errors(const char32_t *input, + size_t size) { + if (simdutf_unlikely(size == 0)) { + // empty input is valid UTF-32. protect the implementation from + // handling nullptr + return result(error_code::SUCCESS, 0); + } + + const char32_t *start = input; + const char32_t *end = input + size; + + using vector_u32 = simd32; + + const auto standardmax = vector_u32::splat(0x10ffff + 1); + const auto surrogate_mask = vector_u32::splat(0xfffff800); + const auto surrogate_byte = vector_u32::splat(0x0000d800); + + constexpr size_t N = vector_u32::ELEMENTS; + + while (input + N < end) { + auto in = vector_u32(input); + if simdutf_constexpr (!match_system(endianness::BIG)) { + in.swap_bytes(); + } + + const auto too_large = in >= standardmax; + const auto surrogate = (in & surrogate_mask) == surrogate_byte; + + const auto combined = too_large | surrogate; + if (simdutf_unlikely(combined.any())) { + const size_t consumed = input - start; + auto sr = scalar::utf32::validate_with_errors(input, end - input); + sr.count += consumed; + + return sr; + } + + input += N; + } + + const size_t consumed = input - start; + auto sr = scalar::utf32::validate_with_errors(input, end - input); + sr.count += consumed; + + return sr; +} + +} // namespace utf32 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/validate_utf32.h */ +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_ASCII +/* begin file src/generic/ascii_validation.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace ascii_validation { + +result generic_validate_ascii_with_errors(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); + + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} + +bool generic_validate_ascii(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + while (reader.has_full_block()) { + simd::simd8x64 
in(reader.full_block()); + if (!in.is_ascii()) { + return false; + } + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + return in.is_ascii(); +} + +} // namespace ascii_validation +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/ascii_validation.h */ +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // For UTF-8 to Latin 1, we can allow any ASCII character, and any + // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or + // 0b11000010 and nothing else. + // + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + FORBIDDEN, + // 1110____ ________ + FORBIDDEN, + // 1111____ ________ + FORBIDDEN); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
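+ // (Editorial note, not from upstream: Latin1 covers only U+0000..U+00FF,
+ // so the sole legal non-ASCII sequences are two bytes whose lead is 0xC2
+ // or 0xC3, e.g. "\xC3\xA9" (U+00E9, 'é') -> 0xE9 and "\xC2\xA0" (U+00A0)
+ // -> 0xA0; every other lead pattern is classified FORBIDDEN in the table
+ // below.)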
+ const simd8 byte_1_low =
+ (prev1 & 0x0F)
+ .lookup_16(
+ // ____0000 ________
+ CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+ // ____0001 ________
+ CARRY | OVERLONG_2,
+ // ____001_ ________
+ CARRY, CARRY,
+
+ // ____0100 ________
+ FORBIDDEN,
+ // ____0101 ________
+ FORBIDDEN,
+ // ____011_ ________
+ FORBIDDEN, FORBIDDEN,
+
+ // ____1___ ________
+ FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
+ // ____1101 ________
+ FORBIDDEN, FORBIDDEN, FORBIDDEN);
+ const simd8 byte_2_high = input.shr<4>().lookup_16(
+ // ________ 0_______
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+ TOO_SHORT, TOO_SHORT,
+
+ // ________ 1000____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+ OVERLONG_4,
+ // ________ 1001____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+ // ________ 101_____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+ // ________ 11______
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+ return (byte_1_high & byte_1_low & byte_2_high);
+}
+
+struct validating_transcoder {
+ // If this is nonzero, there has been a UTF-8 error.
+ simd8 error;
+
+ validating_transcoder() : error(uint8_t(0)) {}
+ //
+ // Check whether the current bytes are valid UTF-8.
+ //
+ simdutf_really_inline void check_utf8_bytes(const simd8 input,
+ const simd8 prev_input) {
+ // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
+ // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
+ // small negative numbers)
+ simd8 prev1 = input.prev<1>(prev_input);
+ this->error |= check_special_cases(input, prev1);
+ }
+
+ simdutf_really_inline size_t convert(const char *in, size_t size,
+ char *latin1_output) {
+ size_t pos = 0;
+ char *start{latin1_output};
+ // In the worst case, we have the haswell kernel which can cause an overflow
+ // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+ // last 16 bytes, and if the data is valid, then it is entirely safe because
+ // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+ // generally assume that you have valid UTF-8 input, so we are going to go
+ // back from the end counting 16 leading bytes, to give us a good margin.
+ size_t leading_byte = 0;
+ size_t margin = size;
+ for (; margin > 0 && leading_byte < 16; margin--) {
+ leading_byte += (int8_t(in[margin - 1]) >
+ -65); // two's complement of -65 is 1011 1111 ...
+ }
+ // If the input is long enough, then we have that margin-1 is the sixteenth
+ // last leading byte.
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64 input(reinterpret_cast(in + pos));
+ if (input.is_ascii()) {
+ input.store((int8_t *)latin1_output);
+ latin1_output += 64;
+ pos += 64;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio,
+ // it is not good enough.
+ static_assert(
+ (simd8x64::NUM_CHUNKS == 2) ||
+ (simd8x64::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ auto zero = simd8{uint8_t(0)};
+ if (simd8x64::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if (simd8x64::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ uint64_t utf8_continuation_mask =
+ input.lt(-65 + 1); // -64 is 1100 0000 in two's complement. Note: in
+ // this case, we also have ASCII to account for.
+ if (utf8_continuation_mask & 1) {
+ return 0; // error
+ }
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_latin1(
+ in + pos, utf8_end_of_code_point_mask, latin1_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ if (errors()) {
+ return 0;
+ }
+ if (pos < size) {
+ size_t howmany =
+ scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
+ if (howmany == 0) {
+ return 0;
+ }
+ latin1_output += howmany;
+ }
+ return latin1_output - start;
+ }
+
+ simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+ char *latin1_output) {
+ size_t pos = 0;
+ char *start{latin1_output};
+ // In the worst case, we have the haswell kernel which can cause an overflow
+ // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+ // last 16 bytes, and if the data is valid, then it is entirely safe because
+ // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+ // generally assume that you have valid UTF-8 input, so we are going to go
+ // back from the end counting 8 leading bytes, to give us a good margin.
+ size_t leading_byte = 0;
+ size_t margin = size;
+ for (; margin > 0 && leading_byte < 8; margin--) {
+ leading_byte += (int8_t(in[margin - 1]) > -65);
+ }
+ // If the input is long enough, then we have that margin-1 is the eighth
+ // last leading byte.
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64 input(reinterpret_cast(in + pos));
+ if (input.is_ascii()) {
+ input.store((int8_t *)latin1_output);
+ latin1_output += 64;
+ pos += 64;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio,
+ // it is not good enough.
+ static_assert(
+ (simd8x64::NUM_CHUNKS == 2) ||
+ (simd8x64::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ auto zero = simd8{uint8_t(0)};
+ if (simd8x64::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if (simd8x64::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ if (errors()) {
+ // rewind_and_convert_with_errors will seek a potential error from
+ // in+pos onward, with the ability to go back up to pos bytes, and
+ // read size-pos bytes forward.
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, latin1_output);
+ res.count += pos;
+ return res;
+ }
+ uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_latin1(
+ in + pos, utf8_end_of_code_point_mask, latin1_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ if (errors()) {
+ // rewind_and_convert_with_errors will seek a potential error from in+pos
+ // onward, with the ability to go back up to pos bytes, and read size-pos
+ // bytes forward.
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, latin1_output);
+ res.count += pos;
+ return res;
+ }
+ if (pos < size) {
+ // rewind_and_convert_with_errors will seek a potential error from in+pos
+ // onward, with the ability to go back up to pos bytes, and read size-pos
+ // bytes forward.
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, latin1_output);
+ if (res.error) { // In case of error, we want the error position
+ res.count += pos;
+ return res;
+ } else { // In case of success, we want the number of words written
+ latin1_output += res.count;
+ }
+ }
+ return result(error_code::SUCCESS, latin1_output - start);
+ }
+
+ simdutf_really_inline bool errors() const {
+ return this->error.any_bits_set_anywhere();
+ }
+
+}; // struct utf8_checker
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
+/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf8_to_latin1 {
+using namespace simd;
+
+simdutf_really_inline size_t convert_valid(const char *in, size_t size,
+ char *latin1_output) {
+ size_t pos = 0;
+ char *start{latin1_output};
+ // In the worst case, we have the haswell kernel which can cause an overflow
+ // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
+ // 16 bytes, and if the data is valid, then it is entirely safe because 16
+ // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
+ // assume that you have valid UTF-8 input, so we are going to go back from the
+ // end counting 8 leading bytes, to give us a good margin.
+ size_t leading_byte = 0;
+ size_t margin = size;
+ for (; margin > 0 && leading_byte < 8; margin--) {
+ leading_byte += (int8_t(in[margin - 1]) >
+ -65); // two's complement of -65 is 1011 1111 ...
+ }
+ // If the input is long enough, then we have that margin-1 is the eighth last
+ // leading byte.
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64 input(reinterpret_cast(in + pos));
+ if (input.is_ascii()) {
+ input.store((int8_t *)latin1_output);
+ latin1_output += 64;
+ pos += 64;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio, it
+ // is not good enough.
+ uint64_t utf8_continuation_mask =
+ input.lt(-65 + 1); // -64 is 1100 0000 in two's complement. Note: in
+ // this case, we also have ASCII to account for.
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_latin1(
+ in + pos, utf8_end_of_code_point_mask, latin1_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ if (pos < size) {
+ size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
+ latin1_output);
+ latin1_output += howmany;
+ }
+ return latin1_output - start;
+}
+
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+
+#if SIMDUTF_FEATURE_BASE64
+/* begin file src/generic/base64.h */
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace base64 {
+
+/*
+ The following template function implements the API for Base64 decoding.
+
+ An implementation is responsible for providing the `block64` type and
+ associated methods that perform the actual conversion. Please refer
+ to any vectorized implementation to learn the API of these procedures.
+*/
+template
+full_result
+compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options) {
+ const uint8_t *to_base64 =
+ default_or_url ? tables::base64::to_base64_default_or_url_value
+ : (base64_url ? tables::base64::to_base64_url_value
+ : tables::base64::to_base64_value);
+ auto ri = simdutf::scalar::base64::find_end(src, srclen, options);
+ size_t equallocation = ri.equallocation;
+ size_t equalsigns = ri.equalsigns;
+ srclen = ri.srclen;
+ size_t full_input_length = ri.full_input_length;
+ if (srclen == 0) {
+ if (!ignore_garbage && equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0};
+ }
+ return {SUCCESS, full_input_length, 0};
+ }
+ char *end_of_safe_64byte_zone =
+ dst == nullptr
+ ? nullptr
+ : ((srclen + 3) / 4 * 3 >= 63 ?
dst + (srclen + 3) / 4 * 3 - 63 + : dst); + + const chartype *const srcinit = src; + const char *const dstinit = dst; + const chartype *const srcend = src + srclen; + + constexpr size_t block_size = 6; + static_assert(block_size >= 2, "block_size must be at least two"); + char buffer[block_size * 64]; + char *bufferptr = buffer; + if (srclen >= 64) { + const chartype *const srcend64 = src + srclen - 64; + while (src <= srcend64) { + block64 b(src); + src += 64; + uint64_t error = 0; + const uint64_t badcharmask = + b.to_base64_mask(&error); + if (!ignore_garbage && error) { + src -= 64; + const size_t error_offset = trailing_zeroes(error); + return {error_code::INVALID_BASE64_CHARACTER, + size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; + } + if (badcharmask != 0) { + bufferptr += b.compress_block(badcharmask, bufferptr); + } else if (bufferptr != buffer) { + b.copy_block(bufferptr); + bufferptr += 64; + } else { + if (dst >= end_of_safe_64byte_zone) { + b.base64_decode_block_safe(dst); + } else { + b.base64_decode_block(dst); + } + dst += 48; + } + if (bufferptr >= (block_size - 1) * 64 + buffer) { + for (size_t i = 0; i < (block_size - 2); i++) { + base64_decode_block(dst, buffer + i * 64); + dst += 48; + } + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, buffer + (block_size - 2) * 64); + } else { + base64_decode_block(dst, buffer + (block_size - 2) * 64); + } + dst += 48; + std::memcpy(buffer, buffer + (block_size - 1) * 64, + 64); // 64 might be too much + bufferptr -= (block_size - 1) * 64; + } + } + } + + char *buffer_start = buffer; + // Optimization note: if this is almost full, then it is worth our + // time, otherwise, we should just decode directly. + int last_block = (int)((bufferptr - buffer_start) % 64); + if (last_block != 0 && srcend - src + last_block >= 64) { + + while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { + uint8_t val = to_base64[uint8_t(*src)]; + *bufferptr = char(val); + if (!ignore_garbage && + (!scalar::base64::is_eight_byte(*src) || val > 64)) { + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + bufferptr += (val <= 63); + src++; + } + } + + for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, buffer_start); + } else { + base64_decode_block(dst, buffer_start); + } + dst += 48; + } + if ((bufferptr - buffer_start) % 64 != 0) { + while (buffer_start + 4 < bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; +#if !SIMDUTF_IS_BIG_ENDIAN + triple = scalar::u32_swap_bytes(triple); +#endif + std::memcpy(dst, &triple, 3); + + dst += 3; + buffer_start += 4; + } + if (buffer_start + 4 <= bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; +#if !SIMDUTF_IS_BIG_ENDIAN + triple = scalar::u32_swap_bytes(triple); +#endif + std::memcpy(dst, &triple, 3); + + dst += 3; + buffer_start += 4; + } + // we may have 1, 2 or 3 bytes left and we need to decode them so let us + // backtrack + int leftover = int(bufferptr - buffer_start); + while (leftover > 0) { + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + 
src--; + } + } else { + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } + } + src--; + leftover--; + } + } + if (src < srcend + equalsigns) { + full_result r = scalar::base64::base64_tail_decode( + dst, src, srcend - src, equalsigns, options, last_chunk_options); + r = scalar::base64::patch_tail_result( + r, size_t(src - srcinit), size_t(dst - dstinit), equallocation, + full_input_length, last_chunk_options); + // When is_partial(last_chunk_options) is true, we must either end with + // the end of the stream (beyond whitespace) or right after a non-ignorable + // character or at the very beginning of the stream. + // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS && + r.input_count < full_input_length) { + // First check if we can extend the input to the end of the stream + while (r.input_count < full_input_length && + base64_ignorable(*(srcinit + r.input_count), options)) { + r.input_count++; + } + // If we are still not at the end of the stream, then we must backtrack + // to the last non-ignorable character. + if (r.input_count < full_input_length) { + while (r.input_count > 0 && + base64_ignorable(*(srcinit + r.input_count - 1), options)) { + r.input_count--; + } + } + } + return r; + } + if (!ignore_garbage && equalsigns > 0) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; + } + } + return {SUCCESS, srclen, size_t(dst - dstinit)}; +} + +} // namespace base64 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/base64.h */ +/* begin file src/generic/find.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace util { + +simdutf_really_inline const char *find(const char *start, const char *end, + char character) noexcept { + // Handle empty or invalid range + if (start >= end) + return end; + // Align the start pointer to 64 bytes + uintptr_t misalignment = reinterpret_cast(start) % 64; + if (misalignment != 0) { + size_t adjustment = 64 - misalignment; + if (size_t(std::distance(start, end)) < adjustment) { + adjustment = std::distance(start, end); + } + for (size_t i = 0; i < adjustment; i++) { + if (start[i] == character) { + return start + i; + } + } + start += adjustment; + } + + // Main loop for 64-byte aligned data + for (; std::distance(start, end) >= 64; start += 64) { + simd8x64 input(reinterpret_cast(start)); + uint64_t matches = input.eq(uint8_t(character)); + if (matches != 0) { + // Found a match, return the first one + int index = trailing_zeroes(matches); + return start + index; + } + } + return std::find(start, end, character); +} + +simdutf_really_inline const char16_t * +find(const char16_t *start, const char16_t *end, char16_t character) noexcept { + // Handle empty or invalid range + if (start >= end) + return end; + // Align the start pointer to 64 bytes if misalignment is even + uintptr_t misalignment = reinterpret_cast(start) % 64; + if (misalignment != 0 && misalignment % 2 == 0) { + size_t adjustment = (64 - misalignment) / sizeof(char16_t); + if (size_t(std::distance(start, end)) < adjustment) { + adjustment = std::distance(start, end); + } + for (size_t i = 0; i < adjustment; i++) { + if (start[i] == character) { + return start + i; + } + } + start += adjustment; + } + + // Main loop for 64-byte aligned data + for (; std::distance(start, end) >= 32; start += 32) { + simd16x32 
input(reinterpret_cast(start));
+ uint64_t matches = input.eq(uint16_t(character));
+ if (matches != 0) {
+ // Found a match, return the first one
+ int index = trailing_zeroes(matches) / 2;
+ return start + index;
+ }
+ }
+ return std::find(start, end, character);
+}
+
+} // namespace util
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/find.h */
+#endif // SIMDUTF_FEATURE_BASE64
+
+/* begin file src/ppc64/templates.cpp */
+/*
+ Template `convert_impl` implements a generic conversion routine between
+ different encodings. The procedure returns the number of written elements,
+ or zero in the case of error.
+
+ Parameters:
+ * VectorizedConvert - vectorized procedure that returns a structure having
+ three fields: error_code (err), const Source* (input), Destination*
+ (output)
+ * ScalarConvert - scalar procedure that carries on conversion of the tail
+ * Source - type of input char (like char16_t, char)
+ * Destination - type of output char
+*/
+template
+size_t convert_impl(VectorizedConvert vectorized_convert,
+ ScalarConvert scalar_convert, const Source *buf, size_t len,
+ Destination *output) {
+ const auto vr = vectorized_convert(buf, len, output);
+ const size_t consumed = vr.input - buf;
+ const size_t written = vr.output - output;
+ if (vr.err != simdutf::error_code::SUCCESS) {
+ if (vr.err == simdutf::error_code::OTHER) {
+ // Vectorized procedure detected an error, but does not know the
+ // exact position. The scalar procedure rescans the portion of
+ // input and figures out where the error is located.
+ return scalar_convert(vr.input, len - consumed, vr.output);
+ }
+ return 0;
+ }
+
+ if (consumed == len) {
+ return written;
+ }
+
+ const auto ret = scalar_convert(vr.input, len - consumed, vr.output);
+ if (ret == 0) {
+ return 0;
+ }
+
+ return written + ret;
+}
+
+/*
+ Template `convert_with_errors_impl` implements a generic conversion routine
+ between different encodings. The procedure returns a `result` instance ---
+ please refer to its documentation for details.
+
+ Parameters:
+ * VectorizedConvert - vectorized procedure that returns a structure having
+ three fields: error_code (err), const Source* (input), Destination*
+ (output)
+ * ScalarConvert - scalar procedure that carries on conversion of the tail
+ * Source - type of input char (like char16_t, char)
+ * Destination - type of output char
+*/
+template
+simdutf::result convert_with_errors_impl(VectorizedConvert vectorized_convert,
+ ScalarConvert scalar_convert,
+ const Source *buf, size_t len,
+ Destination *output) {
+
+ const auto vr = vectorized_convert(buf, len, output);
+ const size_t consumed = vr.input - buf;
+ const size_t written = vr.output - output;
+ if (vr.err != simdutf::error_code::SUCCESS) {
+ if (vr.err == simdutf::error_code::OTHER) {
+ // Vectorized procedure detected an error, but does not know the
+ // exact position. The scalar procedure rescans the portion of
+ // input and figures out where the error is located.
+ auto sr = scalar_convert(vr.input, len - consumed, vr.output); + sr.count += consumed; + return sr; + } + return simdutf::result(vr.err, consumed); + } + + if (consumed == len) { + return simdutf::result(simdutf::error_code::SUCCESS, written); + } + + simdutf::result sr = scalar_convert(vr.input, len - consumed, vr.output); + if (sr.is_ok()) { + sr.count += written; + } else { + sr.count += consumed; + } + + return sr; +} +/* end file src/ppc64/templates.cpp */ + +#ifdef SIMDUTF_INTERNAL_TESTS + #if SIMDUTF_FEATURE_BASE64 + #include "ppc64_base64_internal_tests.cpp" + #endif // SIMDUTF_FEATURE_BASE64 +#endif // SIMDUTF_INTERNAL_TESTS +// +// Implementation-specific overrides +// +namespace simdutf { +namespace ppc64 { + +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + int out = 0; + // todo: reimplement as a one-pass algorithm. + if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; + } + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { + out |= encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + out |= encoding_type::UTF32_LE; + } + } + return out; +} +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_utf8(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return ppc64::ascii_validation::generic_validate_ascii(buf, len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return ppc64::ascii_validation::generic_validate_ascii_with_errors(buf, len); +} +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return ppc64::utf16::validate_utf16_as_ascii_with_errors( + buf, len) + .error == SUCCESS; +} + +simdutf_warn_unused bool +implementation::validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return ppc64::utf16::validate_utf16_as_ascii_with_errors(buf, + len) + .error == SUCCESS; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + const auto res = + ppc64::utf16::validate_utf16_with_errors(buf, len); + if (res.is_err()) { + return false; + } + + if (res.count != len) { + return scalar::utf16::validate(buf + res.count, + len - res.count); + } + + return true; +} +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + 
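+// (Editorial sketch, not part of the library sources: detect_encodings()
+// above returns a bitmask, so a caller can test each candidate encoding
+// independently; `data` and `size` are placeholder names here.
+//
+// int enc = simdutf::detect_encodings(data, size);
+// if (enc & simdutf::encoding_type::UTF8) { /* plausibly UTF-8 */ }
+// if (enc & simdutf::encoding_type::UTF16_LE) { /* plausibly UTF-16LE */ }
+//
+// When a BOM is present, it is trusted and a single encoding is returned.)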
+#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + return validate_utf16be_with_errors(buf, len).is_ok(); +} + +void implementation::to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return scalar::utf16::to_well_formed_utf16(input, len, + output); +} + +void implementation::to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return scalar::utf16::to_well_formed_utf16(input, len, + output); +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + const auto res = + ppc64::utf16::validate_utf16_with_errors(buf, len); + if (res.count != len) { + auto scalar = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + scalar.count += res.count; + return scalar; + } + + return res; +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + const auto res = + ppc64::utf16::validate_utf16_with_errors(buf, len); + if (res.count != len) { + auto scalar = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + scalar.count += res.count; + return scalar; + } + + return res; +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + return utf32::validate(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + return utf32::validate_with_errors(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + const auto ret = ppc64_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; + + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + size_t n = + ppc64_convert_latin1_to_utf16(buf, len, utf16_output); + if (n < len) { + n += scalar::latin1_to_utf16::convert(buf + n, len - n, + utf16_output + n); + } + + return n; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + size_t n = + ppc64_convert_latin1_to_utf16(buf, len, utf16_output); + if (n < len) { + n += scalar::latin1_to_utf16::convert(buf + n, len - n, + utf16_output + n); + } + + return n; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + const auto ret = ppc64_convert_latin1_to_utf32(buf, len, 
utf32_output); + if (ret.first != buf + len) { + const size_t processed = ret.first - buf; + scalar::latin1_to_utf32::convert(ret.first, len - processed, ret.second); + } + + return len; +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return ppc64::utf8_to_latin1::convert_valid(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, + utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(buf, len, utf16_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && 
SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + + return convert_impl( + ppc64_convert_utf16_to_latin1, + scalar::utf16_to_latin1::convert, + buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + + return convert_impl( + ppc64_convert_utf16_to_latin1, + scalar::utf16_to_latin1::convert, + buf, len, latin1_output); +} + +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + + return convert_with_errors_impl( + ppc64_convert_utf16_to_latin1, + scalar::utf16_to_latin1::convert_with_errors, + buf, len, latin1_output); +} + +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + + return convert_with_errors_impl( + ppc64_convert_utf16_to_latin1, + scalar::utf16_to_latin1::convert_with_errors, + buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: we could provide an optimized function. + return convert_utf16be_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: we could provide an optimized function. + return convert_utf16le_to_latin1(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + + return convert_impl(ppc64_convert_utf16_to_utf8, + scalar::utf16_to_utf8::convert, + buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + + return convert_impl( + ppc64_convert_utf16_to_utf8, + scalar::utf16_to_utf8::convert, + buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + + return convert_with_errors_impl( + ppc64_convert_utf16_to_utf8, + scalar::utf16_to_utf8::simple_convert_with_errors, + buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + + return convert_with_errors_impl( + ppc64_convert_utf16_to_utf8, + scalar::utf16_to_utf8::simple_convert_with_errors, buf, + len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t 
implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return convert_impl(ppc64_convert_utf32_to_latin1, + scalar::utf32_to_latin1::convert, buf, len, + latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return convert_with_errors_impl( + ppc64_convert_utf32_to_latin1, + scalar::utf32_to_latin1::convert_with_errors, buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return convert_impl(ppc64_convert_utf32_to_latin1, + scalar::utf32_to_latin1::convert, buf, len, + latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_impl(ppc64_convert_utf32_to_utf8, + scalar::utf32_to_utf8::convert, + buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_with_errors_impl( + ppc64_convert_utf32_to_utf8, + scalar::utf32_to_utf8::convert_with_errors, buf, + len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_impl(ppc64_convert_utf32_to_utf8, + scalar::utf32_to_utf8::convert, + buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + + return convert_impl(ppc64_convert_utf32_to_utf16, + scalar::utf32_to_utf16::convert, buf, + len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + + return convert_impl( + ppc64_convert_utf32_to_utf16, + scalar::utf32_to_utf16::convert, buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + + return convert_with_errors_impl( + ppc64_convert_utf32_to_utf16, + scalar::utf32_to_utf16::convert_with_errors, buf, len, + utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + + return convert_with_errors_impl( + ppc64_convert_utf32_to_utf16, + scalar::utf32_to_utf16::convert_with_errors, buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + + return convert_impl( + ppc64_convert_utf32_to_utf16, + scalar::utf32_to_utf16::convert, buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + + return convert_impl( + ppc64_convert_utf32_to_utf16, + scalar::utf32_to_utf16::convert, buf, len, utf16_output); +} + +simdutf_warn_unused 
size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_impl(ppc64_convert_utf16_to_utf32, + scalar::utf16_to_utf32::convert, buf, + len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_impl(ppc64_convert_utf16_to_utf32, + scalar::utf16_to_utf32::convert, buf, + len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_with_errors_impl( + ppc64_convert_utf16_to_utf32, + scalar::utf16_to_utf32::convert_with_errors, buf, len, + utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_with_errors_impl( + ppc64_convert_utf16_to_utf32, + scalar::utf16_to_utf32::convert_with_errors, buf, len, + utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + const auto ret = ppc64_utf8_length_from_latin1(input, length); + const size_t consumed = ret.first - input; + + if (consumed == length) { + return ret.second; + } + + const auto scalar = + scalar::latin1::utf8_length_from_latin1(ret.first, length - consumed); + return scalar + ret.second; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) 
const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} +simdutf_warn_unused result +implementation::utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::LITTLE>(input, length); +} + +simdutf_warn_unused result +implementation::utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::BIG>(input, length); +} + +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return utf32::utf8_length_from_utf32(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return scalar::utf32::utf16_length_from_utf32(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_BASE64 +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + 
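+      // This final branch is the strict default decoder: standard alphabet
+      // ('+' and '/'), where any byte outside the alphabet other than
+      // ignorable ASCII whitespace is reported as an error instead of
+      // being skipped as in the *_accept_garbage branches above.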
return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} + +size_t 
implementation::binary_to_base64(const char *input, size_t length,
+                                 char *output,
+                                 base64_options options) const noexcept {
+  if (options & base64_url) {
+    return encode_base64<true>(output, input, length, options);
+  } else {
+    return encode_base64<false>(output, input, length, options);
+  }
+}
+
+size_t implementation::binary_to_base64_with_lines(
+    const char *input, size_t length, char *output, size_t line_length,
+    base64_options options) const noexcept {
+  return scalar::base64::tail_encode_base64_impl(output, input, length,
+                                                 options, line_length);
+}
+
+const char *implementation::find(const char *start, const char *end,
+                                 char character) const noexcept {
+  return util::find(start, end, character);
+}
+
+const char16_t *implementation::find(const char16_t *start,
+                                     const char16_t *end,
+                                     char16_t character) const noexcept {
+  return util::find(start, end, character);
+}
+#endif // SIMDUTF_FEATURE_BASE64
+
+#ifdef SIMDUTF_INTERNAL_TESTS
+std::vector<implementation::TestProcedure>
+implementation::internal_tests() const {
+  #define entry(proc)                                                         \
+    TestProcedure { #proc, proc }
+  return {entry(base64_encoding_translate_6bit_values),
+          entry(base64_encoding_expand_6bit_fields),
+          entry(base64_decoding_valid),
+          entry(base64_decoding_invalid_ignore_errors),
+          entry(base64url_decoding_invalid_ignore_errors),
+          entry(base64_decoding_invalid_strict_errors),
+          entry(base64url_decoding_invalid_strict_errors),
+          entry(base64_decoding_pack),
+          entry(base64_compress)};
+  #undef entry
+}
+#endif
+
+} // namespace ppc64
+} // namespace simdutf
+
+/* begin file src/simdutf/ppc64/end.h */
+/* end file src/simdutf/ppc64/end.h */
+/* end file src/ppc64/implementation.cpp */
+#endif
+#if SIMDUTF_IMPLEMENTATION_RVV
+/* begin file src/rvv/implementation.cpp */
+/* begin file src/simdutf/rvv/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "rvv"
+// #define SIMDUTF_IMPLEMENTATION rvv
+
+#if SIMDUTF_CAN_ALWAYS_RUN_RVV
+// nothing needed.
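+// (When the compiler already targets the RVV extensions for the whole
+// translation unit, no per-function target attribute is needed; otherwise
+// SIMDUTF_TARGET_RVV below switches them on for this region.)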
+#else +SIMDUTF_TARGET_RVV +#endif +/* end file src/simdutf/rvv/begin.h */ +namespace simdutf { +namespace rvv { +namespace { +#ifndef SIMDUTF_RVV_H + #error "rvv.h must be included" +#endif + +} // unnamed namespace +} // namespace rvv +} // namespace simdutf + +// +// Implementation-specific overrides +// +namespace simdutf { +namespace rvv { +/* begin file src/rvv/rvv_helpers.inl.cpp */ +template +simdutf_really_inline static size_t +rvv_utf32_store_utf16_m4(uint16_t *dst, vuint32m4_t utf32, size_t vl, + vbool4_t m4even) { + /* convert [000000000000aaaa|aaaaaabbbbbbbbbb] + * to [110111bbbbbbbbbb|110110aaaaaaaaaa] */ + vuint32m4_t sur = __riscv_vsub_vx_u32m4(utf32, 0x10000, vl); + sur = __riscv_vor_vv_u32m4(__riscv_vsll_vx_u32m4(sur, 16, vl), + __riscv_vsrl_vx_u32m4(sur, 10, vl), vl); + sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vl); + sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vl); + /* merge 1 byte utf32 and 2 byte sur */ + vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(utf32, 0xFFFF, vl); + vuint16m4_t utf32_16 = __riscv_vreinterpret_v_u32m4_u16m4( + __riscv_vmerge_vvm_u32m4(utf32, sur, m4, vl)); + /* compress and store */ + vbool4_t mOut = __riscv_vmor_mm_b4( + __riscv_vmsne_vx_u16m4_b4(utf32_16, 0, vl * 2), m4even, vl * 2); + vuint16m4_t vout = __riscv_vcompress_vm_u16m4(utf32_16, mOut, vl * 2); + vl = __riscv_vcpop_m_b4(mOut, vl * 2); + __riscv_vse16_v_u16m4(dst, simdutf_byteflip(vout, vl), vl); + return vl; +}; +/* end file src/rvv/rvv_helpers.inl.cpp */ + +/* begin file src/rvv/rvv_length_from.inl.cpp */ +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t +implementation::count_utf16le(const char16_t *src, size_t len) const noexcept { + return utf32_length_from_utf16le(src, len); +} + +simdutf_warn_unused size_t +implementation::count_utf16be(const char16_t *src, size_t len) const noexcept { + return utf32_length_from_utf16be(src, len); +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused size_t +implementation::count_utf8(const char *src, size_t len) const noexcept { + return utf32_length_from_utf8(src, len); +} +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *src, size_t len) const noexcept { + return utf32_length_from_utf8(src, len); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *src, size_t len) const noexcept { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); + vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); + count += __riscv_vcpop_m_b1(mask, vl); + } + return count; +} +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 +template +simdutf_really_inline static size_t +rvv_utf32_length_from_utf16(const char16_t *src, size_t len) { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + v = simdutf_byteflip(v, vl); + vbool2_t notHigh = + __riscv_vmor_mm_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl), + __riscv_vmsltu_vx_u16m8_b2(v, 0xDC00, vl), vl); + count += __riscv_vcpop_m_b2(notHigh, vl); + } + return count; +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const 
char16_t *src, size_t len) const noexcept { + return rvv_utf32_length_from_utf16(src, len); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *src, size_t len) const noexcept { + if (supports_zvbb()) + return rvv_utf32_length_from_utf16(src, len); + else + return rvv_utf32_length_from_utf16(src, len); +} +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *src, size_t len) const noexcept { + size_t count = len; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); + count += __riscv_vcpop_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl); + } + return count; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +template +simdutf_really_inline static size_t +rvv_utf8_length_from_utf16(const char16_t *src, size_t len) { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + v = simdutf_byteflip(v, vl); + vbool2_t m234 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7F, vl); + vbool2_t m34 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7FF, vl); + vbool2_t notSur = + __riscv_vmor_mm_b2(__riscv_vmsltu_vx_u16m8_b2(v, 0xD800, vl), + __riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl), vl); + vbool2_t m3 = __riscv_vmand_mm_b2(m34, notSur, vl); + count += vl + __riscv_vcpop_m_b2(m234, vl) + __riscv_vcpop_m_b2(m3, vl); + } + return count; +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *src, size_t len) const noexcept { + return rvv_utf8_length_from_utf16(src, len); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *src, size_t len) const noexcept { + if (supports_zvbb()) + return rvv_utf8_length_from_utf16(src, len); + else + return rvv_utf8_length_from_utf16(src, len); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *src, size_t len) const noexcept { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e32m8(len); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); + vbool4_t m234 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7F, vl); + vbool4_t m34 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7FF, vl); + vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl); + count += vl + __riscv_vcpop_m_b4(m234, vl) + __riscv_vcpop_m_b4(m34, vl) + + __riscv_vcpop_m_b4(m4, vl); + } + return count; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *src, size_t len) const noexcept { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); + vbool1_t m1234 = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); + vbool1_t m4 = __riscv_vmsgtu_vx_u8m8_b1(__riscv_vreinterpret_u8m8(v), + (uint8_t)0b11101111, vl); + count += __riscv_vcpop_m_b1(m1234, vl) + __riscv_vcpop_m_b1(m4, vl); + } + return count; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused 
size_t implementation::utf16_length_from_utf32( + const char32_t *src, size_t len) const noexcept { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e32m8(len); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); + vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl); + count += vl + __riscv_vcpop_m_b4(m4, vl); + } + return count; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* end file src/rvv/rvv_length_from.inl.cpp */ +/* begin file src/rvv/rvv_validate.inl.cpp */ +#if SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_ascii(const char *src, size_t len) const noexcept { + size_t vlmax = __riscv_vsetvlmax_e8m8(); + vint8m8_t mask = __riscv_vmv_v_x_i8m8(0, vlmax); + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); + mask = __riscv_vor_vv_i8m8_tu(mask, mask, v, vl); + } + return __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(mask, 0, vlmax), vlmax) < + 0; +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *src, size_t len) const noexcept { + const char *beg = src; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); + long idx = __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl); + if (idx >= 0) + return result(error_code::TOO_LARGE, src - beg + idx); + } + return result(error_code::SUCCESS, src - beg); +} +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +template +simdutf_really_inline bool rvv_validate_utf16_as_ascii(const char16_t *buf, + size_t len) noexcept { + const char16_t *src = buf; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + v = simdutf_byteflip(v, vl); + long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 0x7f, vl), vl); + if (idx >= 0) + return false; + } + return true; +} +simdutf_warn_unused bool +implementation::validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return rvv_validate_utf16_as_ascii(buf, len); +} + +simdutf_warn_unused bool +implementation::validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept { + if (supports_zvbb()) + return rvv_validate_utf16_as_ascii(buf, len); + else + return rvv_validate_utf16_as_ascii(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +/* Returns a close estimation of the number of valid UTF-8 bytes up to the + * first invalid one, but never overestimating. 
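+ * Under-counting is safe: both callers re-run the scalar UTF-8 validator
+ * on the remaining suffix, so stopping early merely lengthens the scalar
+ * tail without changing the verdict or the reported error position.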
*/ +simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src, + size_t len) { + const char *beg = src; + if (len < 32) + return 0; + + /* validate first three bytes */ + { + size_t idx = 3; + while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10) + ++idx; + if (idx > 3 + 3 || !scalar::utf8::validate(src, idx)) + return 0; + } + + static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080}; + static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB}; + static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6}; + + const vuint8m1_t err1tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2)); + const vuint8m1_t err2tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2)); + const vuint8m1_t err3tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2)); + + size_t tail = 3; + size_t n = len - tail; + + for (size_t vl; n > 0; n -= vl, src += vl) { + vl = __riscv_vsetvl_e8m4(n); + vuint8m4_t v0 = __riscv_vle8_v_u8m4((uint8_t const *)src, vl); + + uint8_t next0 = src[vl + 0]; + uint8_t next1 = src[vl + 1]; + uint8_t next2 = src[vl + 2]; + + /* fast path: ASCII */ + if (__riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u8m4_b2(v0, 0b01111111, vl), vl) < + 0 && + (next0 | next1 | next2) < 0b10000000) + continue; + + /* see "Validating UTF-8 In Less Than One Instruction Per Byte" + * https://arxiv.org/abs/2010.03090 */ + vuint8m4_t v1 = __riscv_vslide1down_vx_u8m4(v0, next0, vl); + vuint8m4_t v2 = __riscv_vslide1down_vx_u8m4(v1, next1, vl); + + vuint8m4_t v2_hi_nibble = __riscv_vsrl_vx_u8m4(v2, 4, vl); + vuint8m4_t v3_hi_nibble = + __riscv_vslide1down_vx_u8m4(v2_hi_nibble, next2 >> 4, vl); + + vuint8m4_t idx2 = __riscv_vand_vx_u8m4(v2, 0xF, vl); + vuint8m4_t idx1 = v2_hi_nibble; + vuint8m4_t idx3 = v3_hi_nibble; + + vuint8m4_t err1 = simdutf_vrgather_u8m1x4(err1tbl, idx1); + vuint8m4_t err2 = simdutf_vrgather_u8m1x4(err2tbl, idx2); + vuint8m4_t err3 = simdutf_vrgather_u8m1x4(err3tbl, idx3); + vint8m4_t errs = __riscv_vreinterpret_v_u8m4_i8m4( + __riscv_vand_vv_u8m4(__riscv_vand_vv_u8m4(err1, err2, vl), err3, vl)); + + vbool2_t is_3 = __riscv_vmsgtu_vx_u8m4_b2(v1, 0b11100000 - 1, vl); + vbool2_t is_4 = __riscv_vmsgtu_vx_u8m4_b2(v0, 0b11110000 - 1, vl); + vbool2_t is_34 = __riscv_vmor_mm_b2(is_3, is_4, vl); + vbool2_t err34 = + __riscv_vmxor_mm_b2(is_34, __riscv_vmslt_vx_i8m4_b2(errs, 0, vl), vl); + vbool2_t errm = + __riscv_vmor_mm_b2(__riscv_vmsgt_vx_i8m4_b2(errs, 0, vl), err34, vl); + if (__riscv_vfirst_m_b2(errm, vl) >= 0) + break; + } + + /* we need to validate the last character */ + while (tail < len && (uint8_t(src[0]) >> 6) == 0b10) + --src, ++tail; + return src - beg; +} + +simdutf_warn_unused bool +implementation::validate_utf8(const char *src, size_t len) const noexcept { + size_t count = rvv_count_valid_utf8(src, len); + return scalar::utf8::validate(src + count, len - count); +} +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *src, size_t len) const noexcept { + size_t count = rvv_count_valid_utf8(src, len); + result res = scalar::utf8::validate_with_errors(src + count, len - count); + return result(res.error, count + res.count); +} +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +template +simdutf_really_inline static result +rvv_validate_utf16_with_errors(const char16_t *src, size_t len) { + const char16_t *beg 
= src; + + const uint16_t mask = simdutf_byteflip(0xfc00); + const uint16_t hi_surrogate = simdutf_byteflip(0xd800); + const uint16_t lo_surrogate = simdutf_byteflip(0xdc00); + + uint16_t last = 0; + for (size_t vl; len > 0; len -= vl, src += vl, last = src[-1]) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v1 = __riscv_vle16_v_u16m8((const uint16_t *)src, vl); + vuint16m8_t v0 = __riscv_vslide1up_vx_u16m8(v1, last, vl); + + vbool2_t surhi = __riscv_vmseq_vx_u16m8_b2( + __riscv_vand_vx_u16m8(v0, mask, vl), hi_surrogate, vl); + vbool2_t surlo = __riscv_vmseq_vx_u16m8_b2( + __riscv_vand_vx_u16m8(v1, mask, vl), lo_surrogate, vl); + + long idx = __riscv_vfirst_m_b2(__riscv_vmxor_mm_b2(surhi, surlo, vl), vl); + if (idx >= 0) { + last = simdutf_byteflip(idx > 0 ? src[idx - 1] : last); + return result(error_code::SURROGATE, + src - beg + idx - (last - 0xD800u < 0x400u)); + break; + } + } + if (simdutf_byteflip(last) - 0xD800u < 0x400u) { + return result(error_code::SURROGATE, + src - beg - 1); /* end on high surrogate */ + } else { + return result(error_code::SUCCESS, src - beg); + } +} +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *src, + size_t len) const noexcept { + return rvv_validate_utf16_with_errors(src, len) + .error == error_code::SUCCESS; +} +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *src, + size_t len) const noexcept { + return validate_utf16be_with_errors(src, len).error == error_code::SUCCESS; +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *src, size_t len) const noexcept { + return rvv_validate_utf16_with_errors(src, len); +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *src, size_t len) const noexcept { + if (supports_zvbb()) + return rvv_validate_utf16_with_errors(src, len); + else + return rvv_validate_utf16_with_errors(src, len); +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *src, size_t len) const noexcept { + size_t vlmax = __riscv_vsetvlmax_e32m8(); + vuint32m8_t max = __riscv_vmv_v_x_u32m8(0x10FFFF, vlmax); + vuint32m8_t maxOff = __riscv_vmv_v_x_u32m8(0xFFFFF7FF, vlmax); + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e32m8(len); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); + vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl); + max = __riscv_vmaxu_vv_u32m8_tu(max, max, v, vl); + maxOff = __riscv_vmaxu_vv_u32m8_tu(maxOff, maxOff, off, vl); + } + return __riscv_vfirst_m_b4( + __riscv_vmor_mm_b4( + __riscv_vmsne_vx_u32m8_b4(max, 0x10FFFF, vlmax), + __riscv_vmsne_vx_u32m8_b4(maxOff, 0xFFFFF7FF, vlmax), vlmax), + vlmax) < 0; +} +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *src, size_t len) const noexcept { + const char32_t *beg = src; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e32m8(len); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); + vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 
0xFFFF2000, vl); + long idx1 = + __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl); + long idx2 = __riscv_vfirst_m_b4( + __riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl); + if (idx1 >= 0 && idx2 >= 0) { + if (idx1 <= idx2) { + return result(error_code::TOO_LARGE, src - beg + idx1); + } else { + return result(error_code::SURROGATE, src - beg + idx2); + } + } + if (idx1 >= 0) { + return result(error_code::TOO_LARGE, src - beg + idx1); + } + if (idx2 >= 0) { + return result(error_code::SURROGATE, src - beg + idx2); + } + } + return result(error_code::SUCCESS, src - beg); +} +#endif // SIMDUTF_FEATURE_UTF32 +/* end file src/rvv/rvv_validate.inl.cpp */ + +/* begin file src/rvv/rvv_latin1_to.inl.cpp */ +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *src, size_t len, char *dst) const noexcept { + char *beg = dst; + for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { + vl = __riscv_vsetvl_e8m2(len); + vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl); + vbool4_t nascii = + __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), 0, vl); + size_t cnt = __riscv_vcpop_m_b4(nascii, vl); + vlOut = vl + cnt; + if (cnt == 0) { + __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); + continue; + } + + vuint8m2_t v0 = + __riscv_vor_vx_u8m2(__riscv_vsrl_vx_u8m2(v1, 6, vl), 0b11000000, vl); + v1 = __riscv_vand_vx_u8m2_mu(nascii, v1, v1, 0b10111111, vl); + + vuint8m4_t wide = + __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vwmaccu_vx_u16m4( + __riscv_vwaddu_vv_u16m4(v0, v1, vl), 0xFF, v1, vl)); + vbool2_t mask = __riscv_vmsgtu_vx_u8m4_b2( + __riscv_vsub_vx_u8m4(wide, 0b11000000, vl * 2), 1, vl * 2); + vuint8m4_t comp = __riscv_vcompress_vm_u8m4(wide, mask, vl * 2); + + __riscv_vse8_v_u8m4((uint8_t *)dst, comp, vlOut); + } + return dst - beg; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *src, size_t len, char16_t *dst) const noexcept { + char16_t *beg = dst; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e8m4(len); + vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl); + __riscv_vse16_v_u16m8((uint16_t *)dst, __riscv_vzext_vf2_u16m8(v, vl), vl); + } + return dst - beg; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *src, size_t len, char16_t *dst) const noexcept { + char16_t *beg = dst; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e8m4(len); + vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl); + __riscv_vse16_v_u16m8( + (uint16_t *)dst, + __riscv_vsll_vx_u16m8(__riscv_vzext_vf2_u16m8(v, vl), 8, vl), vl); + } + return dst - beg; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *src, size_t len, char32_t *dst) const noexcept { + char32_t *beg = dst; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e8m2(len); + vuint8m2_t v = __riscv_vle8_v_u8m2((uint8_t *)src, vl); + __riscv_vse32_v_u32m8((uint32_t *)dst, __riscv_vzext_vf4_u32m8(v, vl), vl); + } + return dst - beg; +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* end file src/rvv/rvv_latin1_to.inl.cpp */ +/* begin file src/rvv/rvv_utf16_to.inl.cpp */ +#if 
SIMDUTF_FEATURE_UTF16 +template +simdutf_really_inline static result +rvv_utf16_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) { + const char16_t *const beg = src; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + v = simdutf_byteflip(v, vl); + long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 255, vl), vl); + if (idx >= 0) + return result(error_code::TOO_LARGE, src - beg + idx); + __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); + } + return result(error_code::SUCCESS, src - beg); +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf16le_to_latin1_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf16be_to_latin1_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; +} + +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *src, size_t len, char *dst) const noexcept { + return rvv_utf16_to_latin1_with_errors(src, len, dst); +} + +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *src, size_t len, char *dst) const noexcept { + if (supports_zvbb()) + return rvv_utf16_to_latin1_with_errors(src, len, + dst); + else + return rvv_utf16_to_latin1_with_errors(src, len, dst); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *src, size_t len, char *dst) const noexcept { + const char16_t *const beg = src; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); + } + return src - beg; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *src, size_t len, char *dst) const noexcept { + const char16_t *const beg = src; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vnsrl_wx_u8m4(v, 8, vl), vl); + } + return src - beg; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +template +simdutf_really_inline static result +rvv_utf16_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) { + size_t n = len; + const char16_t *srcBeg = src; + const char *dstBeg = dst; + size_t vl8m4 = __riscv_vsetvlmax_e8m4(); + vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2( + __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4); + + for (size_t vl, vlOut; n > 0;) { + vl = __riscv_vsetvl_e16m2(n); + + vuint16m2_t v = __riscv_vle16_v_u16m2((uint16_t const *)src, vl); + v = simdutf_byteflip(v, vl); + vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80 - 1, vl); + + if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */ + vlOut = vl; + __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(v, vlOut), + vlOut); + n -= vl, src += vl, dst 
+= vlOut; + continue; + } + + vbool8_t m34 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800 - 1, vl); + + if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */ + /* 0: [ aaa|aabbbbbb] + * 1: [aabbbbbb| ] vsll 8 + * 2: [ | aaaaa] vsrl 6 + * 3: [00111111|00011111] + * 4: [ bbbbbb|000aaaaa] (1|2)&3 + * 5: [11000000|11000000] + * 6: [10bbbbbb|110aaaaa] 4|5 */ + vuint16m2_t twoByte = __riscv_vand_vx_u16m2( + __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(v, 8, vl), + __riscv_vsrl_vx_u16m2(v, 6, vl), vl), + 0b0011111100011111, vl); + vuint16m2_t vout16 = + __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl); + vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16); + + /* Every high byte that is zero should be compressed + * low bytes should never be compressed, so we set them + * to all ones, and then create a non-zero bytes mask */ + vbool4_t mcomp = + __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2( + __riscv_vor_vx_u16m2(vout16, 0xFF, vl)), + 0, vl * 2); + vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2); + + vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2); + __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut); + + n -= vl, src += vl, dst += vlOut; + continue; + } + + vbool8_t sur = __riscv_vmseq_vx_u16m2_b8( + __riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl); + long first = __riscv_vfirst_m_b8(sur, vl); + size_t tail = vl - first; + vl = first < 0 ? vl : first; + + if (vl > 0) { /* 1/2/3 byte utf8 */ + /* in: [aaaabbbb|bbcccccc] + * v1: [0bcccccc| ] vsll 8 + * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000 + * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000 + * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000 + * v3: [ |1110aaaa] vsrl 12 | 0b11100000 + * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc] + * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc] + * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] + * [10cccccc] + */ + vuint16m2_t v1, v2, v3, v12; + v1 = __riscv_vor_vx_u16m2_mu( + m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl); + v1 = __riscv_vsll_vx_u16m2(v1, 8, vl); + + v2 = __riscv_vor_vx_u16m2( + __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111, + vl), + 0b10000000, vl); + v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2, + 0b01000000, vl); + v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000, + vl); + v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl); + + vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl); + vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); + vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123); + + vbool2_t mcomp = __riscv_vmor_mm_b2( + m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4); + vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4); + + vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4); + __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut); + + n -= vl, src += vl, dst += vlOut; + } + + if (tail) + while (n) { + uint16_t word = simdutf_byteflip(src[0]); + if ((word & 0xFF80) == 0) { + break; + } else if ((word & 0xF800) == 0) { + break; + } else if ((word & 0xF800) != 0xD800) { + break; + } else { + // must be a surrogate pair + if (n <= 1) + return result(error_code::SURROGATE, src - srcBeg); + uint16_t diff = word - 0xD800; + if (diff > 0x3FF) + return result(error_code::SURROGATE, src - srcBeg); + uint16_t diff2 = simdutf_byteflip(src[1]) - 0xDC00; + if (diff2 > 0x3FF) + return result(error_code::SURROGATE, src - srcBeg); + + uint32_t value = ((diff + 0x40) << 10) + diff2; + + // 
will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *dst++ = (char)((value >> 18) | 0b11110000); + *dst++ = (char)(((value >> 12) & 0b111111) | 0b10000000); + *dst++ = (char)(((value >> 6) & 0b111111) | 0b10000000); + *dst++ = (char)((value & 0b111111) | 0b10000000); + src += 2; + n -= 2; + } + } + } + + return result(error_code::SUCCESS, dst - dstBeg); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf16le_to_utf8_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf16be_to_utf8_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *src, size_t len, char *dst) const noexcept { + return rvv_utf16_to_utf8_with_errors(src, len, dst); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *src, size_t len, char *dst) const noexcept { + if (supports_zvbb()) + return rvv_utf16_to_utf8_with_errors(src, len, dst); + else + return rvv_utf16_to_utf8_with_errors(src, len, dst); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *src, size_t len, char *dst) const noexcept { + return convert_utf16le_to_utf8(src, len, dst); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *src, size_t len, char *dst) const noexcept { + return convert_utf16be_to_utf8(src, len, dst); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +template +simdutf_really_inline static result +rvv_utf16_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) { + const char16_t *const srcBeg = src; + char32_t *const dstBeg = dst; + + constexpr const uint16_t ANY_SURROGATE_MASK = 0xf800; + constexpr const uint16_t ANY_SURROGATE_VALUE = 0xd800; + constexpr const uint16_t LO_SURROGATE_MASK = 0xfc00; + constexpr const uint16_t LO_SURROGATE_VALUE = 0xdc00; + constexpr const uint16_t HI_SURROGATE_MASK = 0xfc00; + constexpr const uint16_t HI_SURROGATE_VALUE = 0xd800; + + uint16_t last = 0; + while (len > 0) { + size_t vl = __riscv_vsetvl_e16m2(len); + vuint16m2_t v0 = __riscv_vle16_v_u16m2((uint16_t const *)src, vl); + v0 = simdutf_byteflip(v0, vl); + + { // check fast-path + const vuint16m2_t v = __riscv_vand_vx_u16m2(v0, ANY_SURROGATE_MASK, vl); + const vbool8_t any_surrogate = + __riscv_vmseq_vx_u16m2_b8(v, ANY_SURROGATE_VALUE, vl); + if (__riscv_vfirst_m_b8(any_surrogate, vl) < 0) { + /* no surrogates */ + __riscv_vse32_v_u32m4((uint32_t *)dst, __riscv_vzext_vf2_u32m4(v0, vl), + vl); + len -= vl; + src += vl; + dst += vl; + continue; + } + } + + if ((simdutf_byteflip(src[0]) & LO_SURROGATE_MASK) == + LO_SURROGATE_VALUE) { + return result(error_code::SURROGATE, src - srcBeg); + } + + // decode surrogates + vuint16m2_t v1 = __riscv_vslide1down_vx_u16m2(v0, 0, vl); + vl = __riscv_vsetvl_e16m2(vl - 1); + if (vl == 0) { + return result(error_code::SURROGATE, src - srcBeg); + } + + const vbool8_t surhi = __riscv_vmseq_vx_u16m2_b8( + __riscv_vand_vx_u16m2(v0, HI_SURROGATE_MASK, vl), HI_SURROGATE_VALUE, + vl); + const vbool8_t surlo = 
__riscv_vmseq_vx_u16m2_b8( + __riscv_vand_vx_u16m2(v1, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE, + vl); + + // compress everything but lo surrogates + const vbool8_t compress = __riscv_vmsne_vx_u16m2_b8( + __riscv_vand_vx_u16m2(v0, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE, + vl); + + { + const vbool8_t diff = __riscv_vmxor_mm_b8(surhi, surlo, vl); + const long idx = __riscv_vfirst_m_b8(diff, vl); + if (idx >= 0) { + uint16_t word = simdutf_byteflip(src[idx]); + if (word < 0xD800 || word > 0xDBFF) { + return result(error_code::SURROGATE, src - srcBeg + idx + 1); + } + return result(error_code::SURROGATE, src - srcBeg + idx); + } + } + + last = simdutf_byteflip(src[vl]); + vuint32m4_t utf32 = __riscv_vzext_vf2_u32m4(v0, vl); + + // v0 = 110110yyyyyyyyyy (0xd800 + yyyyyyyyyy) --- hi surrogate + // v1 = 110111xxxxxxxxxx (0xdc00 + xxxxxxxxxx) --- lo surrogate + + // t0 = u16( 0000_00yy_yyyy_yyyy) + const vuint32m4_t t0 = + __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v0, 0x03ff, vl), vl); + // t1 = u32(0000_0000_0000_yyyy_yyyy_yy00_0000_0000) + const vuint32m4_t t1 = __riscv_vsll_vx_u32m4(t0, 10, vl); + + // t2 = u32(0000_0000_0000_0000_0000_00xx_xxxx_xxxx) + const vuint32m4_t t2 = + __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v1, 0x03ff, vl), vl); + + // t3 = u32(0000_0000_0000_yyyy_yyyy_yyxx_xxxx_xxxx) + const vuint32m4_t t3 = __riscv_vor_vv_u32m4(t1, t2, vl); + + // t4 = utf32 from surrogate pairs + const vuint32m4_t t4 = __riscv_vadd_vx_u32m4(t3, 0x10000, vl); + + const vuint32m4_t result = __riscv_vmerge_vvm_u32m4(utf32, t4, surhi, vl); + + const vuint32m4_t comp = __riscv_vcompress_vm_u32m4(result, compress, vl); + const size_t vlOut = __riscv_vcpop_m_b8(compress, vl); + __riscv_vse32_v_u32m4((uint32_t *)dst, comp, vlOut); + + len -= vl; + src += vl; + dst += vlOut; + + if ((last & LO_SURROGATE_MASK) == LO_SURROGATE_VALUE) { + // last item is lo surrogate and got already consumed + len -= 1; + src += 1; + } + } + + return result(error_code::SUCCESS, dst - dstBeg); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + result res = convert_utf16le_to_utf32_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + result res = convert_utf16be_to_utf32_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? 
res.count : 0; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + return rvv_utf16_to_utf32_with_errors(src, len, dst); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_utf16_to_utf32_with_errors(src, len, + dst); + else + return rvv_utf16_to_utf32_with_errors(src, len, dst); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + return convert_utf16le_to_utf32(src, len, dst); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + return convert_utf16be_to_utf32(src, len, dst); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* end file src/rvv/rvv_utf16_to.inl.cpp */ + +/* begin file src/rvv/rvv_utf32_to.inl.cpp */ +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf32_to_latin1_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *src, size_t len, char *dst) const noexcept { + const char32_t *const beg = src; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e32m8(len); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); + long idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 255, vl), vl); + if (idx >= 0) + return result(error_code::TOO_LARGE, src - beg + idx); + /* We don't use vcompress here, because its performance varies widely on + * current platforms. This might be worth reconsidering once there is more + * hardware available. 
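+ * The plain narrowing path below maps elements one to one (32 -> 16 -> 8
+ * bits); it is correct because the preceding range check guarantees every
+ * code point fits in a single Latin-1 byte.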
*/ + __riscv_vse8_v_u8m2( + (uint8_t *)dst, + __riscv_vncvt_x_x_w_u8m2(__riscv_vncvt_x_x_w_u16m4(v, vl), vl), vl); + } + return result(error_code::SUCCESS, src - beg); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *src, size_t len, char *dst) const noexcept { + return convert_utf32_to_latin1(src, len, dst); +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +template +simdutf_warn_unused result convert_utf32_to_utf8_aux(const char32_t *src, + size_t len, + char *dst) noexcept { + size_t n = len; + const char32_t *srcBeg = src; + const char *dstBeg = dst; + size_t vl8m4 = __riscv_vsetvlmax_e8m4(); + vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2( + __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4); + + for (size_t vl, vlOut; n > 0;) { + vl = __riscv_vsetvl_e32m4(n); + + vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t const *)src, vl); + vbool8_t m234 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x80 - 1, vl); + vuint16m2_t vn = __riscv_vncvt_x_x_w_u16m2(v, vl); + + if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */ + vlOut = vl; + __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(vn, vlOut), + vlOut); + n -= vl, src += vl, dst += vlOut; + continue; + } + + vbool8_t m34 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x800 - 1, vl); + + if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */ + /* 0: [ aaa|aabbbbbb] + * 1: [aabbbbbb| ] vsll 8 + * 2: [ | aaaaa] vsrl 6 + * 3: [00111111|00111111] + * 4: [ bbbbbb|000aaaaa] (1|2)&3 + * 5: [10000000|11000000] + * 6: [10bbbbbb|110aaaaa] 4|5 */ + vuint16m2_t twoByte = __riscv_vand_vx_u16m2( + __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(vn, 8, vl), + __riscv_vsrl_vx_u16m2(vn, 6, vl), vl), + 0b0011111100111111, vl); + vuint16m2_t vout16 = + __riscv_vor_vx_u16m2_mu(m234, vn, twoByte, 0b1000000011000000, vl); + vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16); + + /* Every high byte that is zero should be compressed + * low bytes should never be compressed, so we set them + * to all ones, and then create a non-zero bytes mask */ + vbool4_t mcomp = + __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2( + __riscv_vor_vx_u16m2(vout16, 0xFF, vl)), + 0, vl * 2); + vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2); + + vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2); + __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut); + + n -= vl, src += vl, dst += vlOut; + continue; + } + + if (with_validation) { + const long idx1 = + __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); + vbool8_t sur = __riscv_vmseq_vx_u32m4_b8( + __riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl); + const long idx2 = __riscv_vfirst_m_b8(sur, vl); + if (idx1 >= 0 || idx2 >= 0) { + if (static_cast(idx1) <= + static_cast(idx2)) { + return result(error_code::TOO_LARGE, src - srcBeg + idx1); + } else { + return result(error_code::SURROGATE, src - srcBeg + idx2); + } + } + } + + vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000 - 1, vl); + long first = __riscv_vfirst_m_b8(m4, vl); + size_t tail = vl - first; + vl = first < 0 ? 
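// [Editor's sketch, not vendored simdutf code.] The 0..6 diagram in the
// two-byte path above packs both UTF-8 bytes of a two-byte sequence into
// one 16-bit lane; per code point u (0x80 <= u < 0x800) the byte values
// are simply:
#include <cstdint>
static inline void utf8_encode_2byte(uint16_t u, uint8_t out[2]) {
  out[0] = uint8_t(0b11000000 | (u >> 6));    // 110aaaaa
  out[1] = uint8_t(0b10000000 | (u & 0x3Fu)); // 10bbbbbb
}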
vl : first; + + if (vl > 0) { /* 1/2/3 byte utf8 */ + /* vn: [aaaabbbb|bbcccccc] + * v1: [0bcccccc| ] vsll 8 + * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000 + * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000 + * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000 + * v3: [ |1110aaaa] vsrl 12 | 0b11100000 + * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc] + * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc] + * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] + * [10cccccc] + */ + vuint16m2_t v1, v2, v3, v12; + v1 = __riscv_vor_vx_u16m2_mu( + m234, vn, __riscv_vand_vx_u16m2(vn, 0b00111111, vl), 0b10000000, vl); + v1 = __riscv_vsll_vx_u16m2(v1, 8, vl); + + v2 = __riscv_vor_vx_u16m2( + __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 6, vl), 0b00111111, + vl), + 0b10000000, vl); + v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2, + 0b01000000, vl); + v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 12, vl), 0b11100000, + vl); + v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl); + + vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl); + vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); + vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123); + + vbool2_t mcomp = __riscv_vmor_mm_b2( + m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4); + vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4); + + vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4); + __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut); + + n -= vl, src += vl, dst += vlOut; + } + + if (tail) + while (n) { + uint32_t word = src[0]; + if (word < 0x10000) + break; + if (word > 0x10FFFF) + return result(error_code::TOO_LARGE, src - srcBeg); + *dst++ = (uint8_t)((word >> 18) | 0b11110000); + *dst++ = (uint8_t)(((word >> 12) & 0b111111) | 0b10000000); + *dst++ = (uint8_t)(((word >> 6) & 0b111111) | 0b10000000); + *dst++ = (uint8_t)((word & 0b111111) | 0b10000000); + ++src; + --n; + } + } + + return result(error_code::SUCCESS, dst - dstBeg); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *src, size_t len, char *dst) const noexcept { + constexpr bool with_validation = true; + return convert_utf32_to_utf8_aux(src, len, dst); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf32_to_utf8_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? 
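// [Editor's note.] The with_validation branch above reports whichever
// error appears first in the input: vfirst returns -1 when a mask is
// empty, and casting idx1/idx2 to an unsigned type turns -1 into the
// largest possible value, so the comparison "idx1 <= idx2" selects
// TOO_LARGE only when it truly precedes the first surrogate.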
res.count : 0; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *src, size_t len, char *dst) const noexcept { + constexpr bool with_validation = false; + const auto res = convert_utf32_to_utf8_aux(src, len, dst); + return res.count; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +template +simdutf_really_inline static result +rvv_convert_utf32_to_utf16_with_errors(const char32_t *src, size_t len, + char16_t *dst) { + size_t vl8m2 = __riscv_vsetvlmax_e8m2(); + vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4( + __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); + const char16_t *dstBeg = dst; + const char32_t *srcBeg = src; + for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { + vl = __riscv_vsetvl_e32m4(len); + vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl); + vuint32m4_t off = __riscv_vadd_vx_u32m4(v, 0xFFFF2000, vl); + const long err_surrogate_idx = __riscv_vfirst_m_b8( + __riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl); + const long idx = + __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl); + if (idx < 0) { + if (err_surrogate_idx >= 0) { + return result(error_code::SURROGATE, src - srcBeg + err_surrogate_idx); + } + + vlOut = vl; + vuint16m2_t n = + simdutf_byteflip(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); + __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut); + continue; + } + + const long err_too_big_idx = + __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); + if (err_too_big_idx >= 0 || err_surrogate_idx >= 0) { + if (static_cast(err_too_big_idx) <= + static_cast(err_surrogate_idx)) { + return result(error_code::TOO_LARGE, src - srcBeg + err_too_big_idx); + } else { + return result(error_code::SURROGATE, src - srcBeg + err_surrogate_idx); + } + } + + vlOut = rvv_utf32_store_utf16_m4((uint16_t *)dst, v, vl, m4even); + } + return result(error_code::SUCCESS, dst - dstBeg); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + result res = convert_utf32_to_utf16le_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + result res = convert_utf32_to_utf16be_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? 
res.count : 0; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + return rvv_convert_utf32_to_utf16_with_errors( + src, len, dst); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_convert_utf32_to_utf16_with_errors( + src, len, dst); + else + return rvv_convert_utf32_to_utf16_with_errors(src, len, + dst); +} + +template +simdutf_really_inline static size_t +rvv_convert_valid_utf32_to_utf16(const char32_t *src, size_t len, + char16_t *dst) { + size_t vl8m2 = __riscv_vsetvlmax_e8m2(); + vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4( + __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); + char16_t *dstBeg = dst; + for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { + vl = __riscv_vsetvl_e32m4(len); + vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl); + if (__riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl) < + 0) { + vlOut = vl; + vuint16m2_t n = + simdutf_byteflip(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); + __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut); + continue; + } + vlOut = rvv_utf32_store_utf16_m4((uint16_t *)dst, v, vl, m4even); + } + return dst - dstBeg; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + return rvv_convert_valid_utf32_to_utf16(src, len, + dst); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_convert_valid_utf32_to_utf16(src, len, + dst); + else + return rvv_convert_valid_utf32_to_utf16(src, len, dst); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* end file src/rvv/rvv_utf32_to.inl.cpp */ +/* begin file src/rvv/rvv_utf8_to.inl.cpp */ +#if SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32) +template +simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, + size_t len, Tdst *dst) { + static_assert(std::is_same() || + std::is_same(), + "invalid type"); + constexpr bool is16 = std::is_same(); + constexpr endianness endian = + bflip == simdutf_ByteFlip::NONE ? endianness::LITTLE : endianness::BIG; + const auto scalar = [](char const *in, size_t count, Tdst *out) { + return is16 ? 
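// [Editor's note.] Dispatch pattern used throughout this kernel family:
// a single templated implementation, specialised on how 16-bit lanes get
// byte-swapped -- simdutf_ByteFlip::NONE for native-endian output, the
// Zvbb-accelerated swap (vrev8) when supports_zvbb() reports the
// extension at runtime, and a generic vector swap otherwise.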
scalar::utf8_to_utf16::convert(in, count, + (char16_t *)out) + : scalar::utf8_to_utf32::convert(in, count, (char32_t *)out); + }; + + if (len < 32) + return scalar(src, len, dst); + + /* validate first three bytes */ + if (validate) { + size_t idx = 3; + while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10) + ++idx; + if (idx > 3 + 3 || !scalar::utf8::validate(src, idx)) + return 0; + } + + size_t tail = 3; + size_t n = len - tail; + Tdst *beg = dst; + + static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080}; + static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB}; + static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6}; + + const vuint8m1_t err1tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2)); + const vuint8m1_t err2tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2)); + const vuint8m1_t err3tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2)); + + size_t vl8m1 = __riscv_vsetvlmax_e8m1(); + size_t vl8m2 = __riscv_vsetvlmax_e8m2(); + vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4( + __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); + + for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dst += vlOut) { + vl = __riscv_vsetvl_e8m2(n); + + vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const *)src, vl); + uint64_t max = __riscv_vmv_x_s_u8m1_u8( + __riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl)); + + uint8_t next0 = src[vl + 0]; + uint8_t next1 = src[vl + 1]; + uint8_t next2 = src[vl + 2]; + + /* fast path: ASCII */ + if ((max | next0 | next1 | next2) < 0b10000000) { + vlOut = vl; + if (is16) + __riscv_vse16_v_u16m4( + (uint16_t *)dst, + simdutf_byteflip(__riscv_vzext_vf2_u16m4(v0, vlOut), vlOut), + vlOut); + else + __riscv_vse32_v_u32m8((uint32_t *)dst, + __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut); + continue; + } + + /* see "Validating UTF-8 In Less Than One Instruction Per Byte" + * https://arxiv.org/abs/2010.03090 */ + vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl); + vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl); + vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl); + + if (validate) { + vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl); + vuint8m2_t idx1 = __riscv_vsrl_vx_u8m2(v2, 4, vl); + vuint8m2_t idx3 = __riscv_vsrl_vx_u8m2(v3, 4, vl); + + vuint8m2_t err1 = simdutf_vrgather_u8m1x2(err1tbl, idx1); + vuint8m2_t err2 = simdutf_vrgather_u8m1x2(err2tbl, idx2); + vuint8m2_t err3 = simdutf_vrgather_u8m1x2(err3tbl, idx3); + vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2( + __riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl)); + + vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000 - 1, vl); + vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000 - 1, vl); + vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl); + vbool4_t err34 = + __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl); + vbool4_t errm = + __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl); + if (__riscv_vfirst_m_b4(errm, vl) >= 0) + return 0; + } + + /* decoding */ + + /* mask of non continuation bytes */ + vbool4_t m = + __riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl); + vlOut = __riscv_vcpop_m_b4(m, vl); + + /* extract first and second bytes */ + vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl); + vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl); + + /* fast path: one and two byte */ + if (max < 0b11100000) { + b2 = __riscv_vand_vx_u8m2(b2, 
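// [Editor's note.] err1m/err2m/err3m above are the three 16-entry nibble
// tables of the lookup scheme from the paper cited above
// (arXiv:2010.03090): each byte transition is classified by (high nibble
// of the previous byte, low nibble of the previous byte, high nibble of
// the current byte); ANDing the three table lookups yields a nonzero
// flag exactly on malformed two-byte transitions, with the 3- and 4-byte
// cases patched up separately via the is_3/is_4 masks.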
0b00111111, vlOut); + + vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); + b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); + + vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4( + b1, + __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1, + vlOut), + vlOut); + b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); + if (is16) + __riscv_vse16_v_u16m4((uint16_t *)dst, + simdutf_byteflip(b12, vlOut), vlOut); + else + __riscv_vse32_v_u32m8((uint32_t *)dst, + __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut); + continue; + } + + /* fast path: one, two and three byte */ + if (max < 0b11110000) { + vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); + + b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); + b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut); + + vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); + vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut); + + vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); + b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut); + + vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4( + b1, + __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1, + vlOut), + vlOut); + b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); + vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu( + m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut); + if (is16) + __riscv_vse16_v_u16m4((uint16_t *)dst, + simdutf_byteflip(b123, vlOut), vlOut); + else + __riscv_vse32_v_u32m8((uint32_t *)dst, + __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut); + continue; + } + + /* extract third and fourth bytes */ + vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); + vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl); + + /* remove prefix from leading bytes + * + * We could also use vrgather here, but it increases register pressure, + * and its performance varies widely on current platforms. It might be + * worth reconsidering, though, once there is more hardware available. + * Same goes for the __riscv_vsrl_vv_u32m4 correction step. 
+ * + * We shift left and then right by the number of bytes in the prefix, + * which can be calculated as follows: + * x max(x-10, 0) + * 0xxx -> 0000-0111 -> sift by 0 or 1 -> 0 + * 10xx -> 1000-1011 -> don't care + * 110x -> 1100,1101 -> sift by 3 -> 2,3 + * 1110 -> 1110 -> sift by 4 -> 4 + * 1111 -> 1111 -> sift by 5 -> 5 + * + * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we + * just need to manually detect and handle the one special case: + */ + #define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx) \ + vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx); \ + vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx); \ + vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx); \ + vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx); \ + /* remove prefix from trailing bytes */ \ + c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut); \ + c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut); \ + c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut); \ + vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut); \ + shift = __riscv_vmerge_vxm_u8m1( \ + __riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, \ + __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), vlOut); \ + c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut); \ + c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut); \ + /* unconditionally widen and combine to c1234 */ \ + vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2( \ + __riscv_vwmulu_vx_u16m2(c3, 1 << 6, vlOut), c4, vlOut); \ + vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2( \ + __riscv_vwmulu_vx_u16m2(c1, 1 << 6, vlOut), c2, vlOut); \ + vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4( \ + __riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut); \ + /* derive required right-shift amount from `shift` to reduce \ + * c1234 to the required number of bytes */ \ + c1234 = __riscv_vsrl_vv_u32m4( \ + c1234, \ + __riscv_vzext_vf4_u32m4( \ + __riscv_vmul_vx_u8m1( \ + __riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2, vlOut), \ + 3, vlOut), \ + 6, vlOut), \ + vlOut), \ + vlOut); \ + /* store result in desired format */ \ + if (is16) \ + vlDst = rvv_utf32_store_utf16_m4((uint16_t *)dst, c1234, vlOut, \ + m4even); \ + else \ + vlDst = vlOut, __riscv_vse32_v_u32m4((uint32_t *)dst, c1234, vlOut); + + /* Unrolling this manually reduces register pressure and allows + * us to terminate early. */ + { + size_t vlOutm2 = vlOut, vlDst; + vlOut = __riscv_vsetvl_e8m1(vlOut < vl8m1 ? 
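// [Editor's sketch, not vendored simdutf code.] Scalar model of the
// shift rule tabulated above: from the high nibble n of a leading byte,
// shift = max(n - 10, 0), with the single special case n == 12 (0b110x
// leads) forced to 3, which is what the vmerge over vssubu does.
#include <cstdint>
static inline unsigned utf8_prefix_shift(uint8_t lead) {
  unsigned n = lead >> 4;
  unsigned s = n > 10 ? n - 10 : 0; // vssubu.vx analogue (saturating sub)
  return n == 12 ? 3 : s;          // 1100 needs 3, not 2
}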
vlOut : vl8m1); + SIMDUTF_RVV_UTF8_TO_COMMON_M1(0) + if (vlOutm2 == vlOut) { + vlOut = vlDst; + continue; + } + + dst += vlDst; + vlOut = vlOutm2 - vlOut; + } + { + size_t vlDst; + SIMDUTF_RVV_UTF8_TO_COMMON_M1(1) + vlOut = vlDst; + } + + #undef SIMDUTF_RVV_UTF8_TO_COMMON_M1 + } + + /* validate the last character and reparse it + tail */ + if (len > tail) { + if ((uint8_t(src[0]) >> 6) == 0b10) + --dst; + while ((uint8_t(src[0]) >> 6) == 0b10 && tail < len) + --src, ++tail; + if (is16) { + /* go back one more, when on high surrogate */ + if (simdutf_byteflip((uint16_t)dst[-1]) >= 0xD800 && + simdutf_byteflip((uint16_t)dst[-1]) <= 0xDBFF) + --dst; + } + } + size_t ret = scalar(src, tail, dst); + if (ret == 0) + return 0; + return (size_t)(dst - beg) + ret; +} +#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || + // SIMDUTF_FEATURE_UTF32) + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *src, size_t len, char *dst) const noexcept { + const char *beg = dst; + uint8_t last = 0; + for (size_t vl, vlOut; len > 0; + len -= vl, src += vl, dst += vlOut, last = src[-1]) { + vl = __riscv_vsetvl_e8m2(len); + vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl); + // check which bytes are ASCII + vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl); + // count ASCII bytes + vlOut = __riscv_vcpop_m_b4(ascii, vl); + // The original code would only enter the next block after this check: + // vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); + // vlOut = __riscv_vcpop_m_b4(m, vl); + // if (vlOut != vl || last > 0b01111111) {...}q + // So that everything is ASCII or continuation bytes, we just proceeded + // without any processing, going straight to __riscv_vse8_v_u8m2. + // But you need the __riscv_vslide1up_vx_u8m2 whenever there is a non-ASCII + // byte. + if (vlOut != vl) { // If not pure ASCII + // Non-ASCII characters + // We now want to mark the ascii and continuation bytes + vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); + // We count them, that's our new vlOut (output vector length) + vlOut = __riscv_vcpop_m_b4(m, vl); + + vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); + + vbool4_t leading0 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl); + vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4( + __riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl); + // -62 i 0b11000010, so we check whether any of v0 is too big + vbool4_t tobig = __riscv_vmand_mm_b4( + leading0, + __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl), + 1, vl), + vl); + if (__riscv_vfirst_m_b4( + __riscv_vmor_mm_b4( + tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl), + vl) >= 0) + return 0; + + v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), + v1, v1, 0b01000000, vl); + v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); + } else if (last >= 0b11000000) { // If last byte is a leading byte and we + // got only ASCII, error! 
+ return 0; + } + __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); + } + if (last > 0b10111111) + return 0; + return dst - beg; +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *src, size_t len, char *dst) const noexcept { + size_t res = convert_utf8_to_latin1(src, len, dst); + if (res) + return result(error_code::SUCCESS, res); + return scalar::utf8_to_latin1::convert_with_errors(src, len, dst); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *src, size_t len, char *dst) const noexcept { + const char *beg = dst; + uint8_t last = 0; + for (size_t vl, vlOut; len > 0; + len -= vl, src += vl, dst += vlOut, last = src[-1]) { + vl = __riscv_vsetvl_e8m2(len); + vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl); + vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl); + vlOut = __riscv_vcpop_m_b4(ascii, vl); + if (vlOut != vl) { // If not pure ASCII + vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); + vlOut = __riscv_vcpop_m_b4(m, vl); + vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); + v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), + v1, v1, 0b01000000, vl); + v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); + } + __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); + } + return dst - beg; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *src, size_t len, char16_t *dst) const noexcept { + return rvv_utf8_to_common(src, len, + (uint16_t *)dst); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *src, size_t len, char16_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_utf8_to_common( + src, len, (uint16_t *)dst); + else + return rvv_utf8_to_common(src, len, + (uint16_t *)dst); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *src, size_t len, char16_t *dst) const noexcept { + size_t res = convert_utf8_to_utf16le(src, len, dst); + if (res) + return result(error_code::SUCCESS, res); + return scalar::utf8_to_utf16::convert_with_errors( + src, len, dst); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *src, size_t len, char16_t *dst) const noexcept { + size_t res = convert_utf8_to_utf16be(src, len, dst); + if (res) + return result(error_code::SUCCESS, res); + return scalar::utf8_to_utf16::convert_with_errors(src, len, + dst); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *src, size_t len, char16_t *dst) const noexcept { + return rvv_utf8_to_common( + src, len, (uint16_t *)dst); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *src, size_t len, char16_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_utf8_to_common( + src, len, (uint16_t *)dst); + else + return rvv_utf8_to_common( + src, len, (uint16_t *)dst); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *src, size_t len, char32_t *dst) const noexcept { + return rvv_utf8_to_common(src, len, + (uint32_t *)dst); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *src, size_t len, char32_t *dst) const noexcept { + size_t res = 
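// [Editor's sketch, not vendored simdutf code.] Scalar model of the
// C2/C3 handling in convert_utf8_to_latin1 above: a continuation byte
// 0b10xxxxxx already carries the 0x80 bit of the Latin-1 result, and a
// 0xC3 lead contributes the extra 0x40 that the masked vor_vx adds.
#include <cstdint>
static inline uint8_t latin1_from_2byte(uint8_t lead, uint8_t cont) {
  return uint8_t(cont | (lead == 0xC3 ? 0x40 : 0)); // lead is 0xC2 or 0xC3
}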
convert_utf8_to_utf32(src, len, dst); + if (res) + return result(error_code::SUCCESS, res); + return scalar::utf8_to_utf32::convert_with_errors(src, len, dst); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *src, size_t len, char32_t *dst) const noexcept { + return rvv_utf8_to_common( + src, len, (uint32_t *)dst); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* end file src/rvv/rvv_utf8_to.inl.cpp */ + +#if SIMDUTF_FEATURE_BASE64 +/* begin file src/rvv/rvv_find.cpp */ +const char *implementation::find(const char *start, const char *end, + char character) const noexcept { + const char *src = start; + for (size_t len = end - start, vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vuint8m8_t v = __riscv_vle8_v_u8m8((uint8_t *)src, vl); + long idx = + __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, character, vl), vl); + if (idx >= 0) + return src + idx; + } + return end; +} + +const char16_t *implementation::find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept { + const char16_t *src = start; + for (size_t len = end - start, vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + long idx = + __riscv_vfirst_m_b2(__riscv_vmseq_vx_u16m8_b2(v, character, vl), vl); + if (idx >= 0) + return src + idx; + } + return end; +} +/* end file src/rvv/rvv_find.cpp */ +#endif // SIMDUTF_FEATURE_BASE64 + +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/rvv/rvv_utf16fix.cpp */ +template +simdutf_really_inline void utf16fix_block_rvv(char16_t *out, const char16_t *in, + size_t vl) { + const char16_t replacement = scalar::utf16::replacement(); + vuint16m8_t block = __riscv_vle16_v_u16m8((const uint16_t *)in, vl); + vuint16m8_t lookback = __riscv_vslide1up_vx_u16m8(block, in[-1], vl); + vuint16m8_t lb_masked = __riscv_vand_vx_u16m8( + lookback, scalar::utf16::swap_if_needed(0xfc00U), vl); + vuint16m8_t block_masked = __riscv_vand_vx_u16m8( + block, scalar::utf16::swap_if_needed(0xfc00U), vl); + vbool2_t lb_is_high = __riscv_vmseq_vx_u16m8_b2( + lb_masked, scalar::utf16::swap_if_needed(0xd800U), vl); + vbool2_t block_is_low = __riscv_vmseq_vx_u16m8_b2( + block_masked, scalar::utf16::swap_if_needed(0xdc00U), vl); + + vbool2_t illseq = __riscv_vmxor_mm_b2(lb_is_high, block_is_low, vl); + if (__riscv_vfirst_m_b2(illseq, vl) >= 0) { + vbool2_t lb_illseq = __riscv_vmandn_mm_b2(lb_is_high, block_is_low, vl); + + vbool2_t lb_illseq_right_shifted; + if (vlmax) { + /* right shift mask register directly via reinterpret at vlmax */ + size_t vlm = __riscv_vsetvlmax_e8mf2(); + vuint8mf2_t vlb_illseq = + __riscv_vlmul_trunc_u8mf2(__riscv_vreinterpret_u8m1(lb_illseq)); + lb_illseq_right_shifted = + __riscv_vreinterpret_b2(__riscv_vlmul_ext_u8m1(__riscv_vmacc_vx_u8mf2( + __riscv_vsrl_vx_u8mf2(vlb_illseq, 1, vlm), 1 << 7, + __riscv_vslide1down_vx_u8mf2(vlb_illseq, 0, vlm), vlm))); + } else { + lb_illseq_right_shifted = __riscv_vmandn_mm_b2( + __riscv_vmseq_vx_u16m8_b2( + __riscv_vslide1down_vx_u16m8(lb_masked, 0, vl), + scalar::utf16::swap_if_needed(0xd800U), vl), + __riscv_vmseq_vx_u16m8_b2( + __riscv_vslide1down_vx_u16m8(block_masked, 0, vl), + scalar::utf16::swap_if_needed(0xdc00U), vl), + vl); + } + + char16_t last = out[-1]; /* allow compiler to generate branchless code */ + out[-1] = __riscv_vfirst_m_b2(lb_illseq, vl) == 0 ? 
replacement : last; + vbool2_t block_illseq = + __riscv_vmor_mm_b2(__riscv_vmandn_mm_b2(block_is_low, lb_is_high, vl), + lb_illseq_right_shifted, vl); + block = __riscv_vmerge_vxm_u16m8(block, replacement, block_illseq, vl); + __riscv_vse16_v_u16m8((uint16_t *)out, block, vl); + } else if (!in_place) { + __riscv_vse16_v_u16m8((uint16_t *)out, block, vl); + } +} + +template +void rvv_to_well_formed_utf16(const char16_t *in, size_t n, char16_t *out) { + const char16_t replacement = scalar::utf16::replacement(); + const size_t VL = __riscv_vsetvlmax_e16m8(); + if (n == 0) + return; + + out[0] = + scalar::utf16::is_low_surrogate(in[0]) ? replacement : in[0]; + n -= 1; + in += 1; + out += 1; + + /* duplicate code to have the compiler specialise utf16fix_block() */ + if (in == out) { + for (; n > VL; n -= VL, in += VL, out += VL) { + utf16fix_block_rvv(out, in, VL); + } + utf16fix_block_rvv(out, in, n); + } else { + for (; n > VL; n -= VL, in += VL, out += VL) { + utf16fix_block_rvv(out, in, VL); + } + utf16fix_block_rvv(out, in, n); + } + + out[n - 1] = scalar::utf16::is_high_surrogate(out[n - 1]) + ? replacement + : out[n - 1]; +} + +void implementation::to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return rvv_to_well_formed_utf16(input, len, output); +} + +void implementation::to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return rvv_to_well_formed_utf16(input, len, output); +} + +template +simdutf_really_inline static void +rvv_change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) { + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + __riscv_vse16_v_u16m8((uint16_t *)dst, simdutf_byteflip(v, vl), vl); + } +} + +void implementation::change_endianness_utf16(const char16_t *src, size_t len, + char16_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_change_endianness_utf16(src, len, dst); + else + return rvv_change_endianness_utf16(src, len, dst); +} +/* end file src/rvv/rvv_utf16fix.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) + return bom_encoding; + // todo: reimplement as a one-pass algorithm. 
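// [Editor's sketch, not vendored simdutf code.] The rule the utf16fix
// kernels above enforce, stated as plain scalar code (little-endian,
// illustrative names): every unpaired surrogate half is overwritten with
// U+FFFD, everything else is copied through unchanged.
#include <cstddef>
static void to_well_formed_scalar(const char16_t *in, size_t n,
                                  char16_t *out) {
  for (size_t i = 0; i < n; i++) {
    bool hi = (in[i] & 0xFC00) == 0xD800;
    bool lo = (in[i] & 0xFC00) == 0xDC00;
    bool prev_hi = i > 0 && (in[i - 1] & 0xFC00) == 0xD800;
    bool next_lo = i + 1 < n && (in[i + 1] & 0xFC00) == 0xDC00;
    out[i] = (hi && !next_lo) || (lo && !prev_hi) ? char16_t(0xFFFD) : in[i];
  }
}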
+ int out = 0; + if (validate_utf8(input, length)) + out |= encoding_type::UTF8; + if (length % 2 == 0) { + if (validate_utf16le(reinterpret_cast(input), length / 2)) + out |= encoding_type::UTF16_LE; + } + if (length % 4 == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) + out |= encoding_type::UTF32_LE; + } + + return out; +} +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_BASE64 +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return simdutf::scalar::base64::base64_to_binary_details_impl( + input, length, output, options, last_chunk_options); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return simdutf::scalar::base64::base64_to_binary_details_impl( + input, length, output, options, last_chunk_options); +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return simdutf::scalar::base64::base64_to_binary_details_impl( + input, length, output, options, last_chunk_options); +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return simdutf::scalar::base64::base64_to_binary_details_impl( + input, length, output, options, last_chunk_options); +} + +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + return scalar::base64::tail_encode_base64(output, input, length, options); +} + +size_t implementation::binary_to_base64_with_lines( + const char *input, size_t length, char *output, size_t line_length, + base64_options options) const noexcept { + return scalar::base64::tail_encode_base64_impl(output, input, length, + options, line_length); +} +#endif // SIMDUTF_FEATURE_BASE64 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused result +implementation::utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::LITTLE>(input, length); +} + +simdutf_warn_unused result +implementation::utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::BIG>(input, length); +} + +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +} // namespace rvv +} // namespace simdutf + +/* begin file src/simdutf/rvv/end.h */ +#if SIMDUTF_CAN_ALWAYS_RUN_RVV +// nothing needed. +#else +SIMDUTF_UNTARGET_REGION +#endif + +/* end file src/simdutf/rvv/end.h */ +/* end file src/rvv/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE +/* begin file src/westmere/implementation.cpp */ +/* begin file src/simdutf/westmere/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "westmere" +// #define SIMDUTF_IMPLEMENTATION westmere +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 + +#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE +// nothing needed. 
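// [Editor's note.] detect_encodings() above returns a bitmask rather than
// a single value, since short buffers often validate as several encodings
// at once; a caller typically tests individual bits, e.g. (illustrative):
//   int enc = impl->detect_encodings(data, size);
//   if (enc & simdutf::encoding_type::UTF8) { /* treat as UTF-8 */ }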
+#else +SIMDUTF_TARGET_WESTMERE +#endif +/* end file src/simdutf/westmere/begin.h */ + +namespace simdutf { +namespace westmere { +namespace { +#ifndef SIMDUTF_WESTMERE_H + #error "westmere.h must be included" +#endif +using namespace simd; + +#if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || \ + SIMDUTF_FEATURE_UTF8 +simdutf_really_inline bool is_ascii(const simd8x64 &input) { + return input.reduce_or().is_ascii(); +} +#endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || + // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_really_inline simd8 +must_be_2_3_continuation(const simd8 prev2, + const simd8 prev3) { + simd8 is_third_byte = + prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80 + simd8 is_fourth_byte = + prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80 + return simd8(is_third_byte | is_fourth_byte); +} +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 +/* begin file src/westmere/internal/loader.cpp */ +namespace internal { +namespace westmere { + +/* begin file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */ +/* + * reads a vector of uint16 values + * bits after 11th are ignored + * first 11 bits are encoded into utf8 + * !important! utf8_output must have at least 16 writable bytes + */ + +inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output, + const __m128i one_byte_bytemask, + const uint16_t one_byte_bitmask) { + // 0b1100_0000_1000_0000 + const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080); + // 0b0001_1111_0000_0000 + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); + // 0b0000_0000_0011_1111 + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = _mm_slli_epi16(v_u16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = _mm_and_si128(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = _mm_and_si128(v_u16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = _mm_or_si128(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = _mm_or_si128(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a + // - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); + + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + + // 6. 
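// [Editor's sketch, not vendored simdutf code.] Scalar model of the
// saturating-subtract trick in must_be_2_3_continuation above: the high
// bit survives the subtraction exactly for bytes that open 3-byte
// (>= 0xE0) or 4-byte (>= 0xF0) sequences, i.e. positions that must be
// followed by continuation bytes.
#include <cstdint>
static inline bool must_be_2_3_cont_scalar(uint8_t prev2, uint8_t prev3) {
  uint8_t third = prev2 >= 0xE0 ? uint8_t(prev2 - (0xE0 - 0x80)) : 0;
  uint8_t fourth = prev3 >= 0xF0 ? uint8_t(prev3 - (0xF0 - 0x80)) : 0;
  return ((third | fourth) & 0x80) != 0;
}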
adjust pointers + utf8_output += row[0]; +} + +inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output, + const __m128i v_0000, + const __m128i v_ff80) { + // no bits set above 7th bit + const __m128i one_byte_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000); + const uint16_t one_byte_bitmask = + static_cast(_mm_movemask_epi8(one_byte_bytemask)); + + write_v_u16_11bits_to_utf8(v_u16, utf8_output, one_byte_bytemask, + one_byte_bitmask); +} +/* end file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */ + +} // namespace westmere +} // namespace internal +/* end file src/westmere/internal/loader.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/westmere/sse_utf16fix.cpp */ +/* + * Process one block of 8 characters. If in_place is false, + * copy the block from in to out. If there is a sequencing + * error in the block, overwrite the illsequenced characters + * with the replacement character. This function reads one + * character before the beginning of the buffer as a lookback. + * If that character is illsequenced, it too is overwritten. + */ +template +simdutf_really_inline void utf16fix_block_sse(char16_t *out, + const char16_t *in) { + auto swap_if_needed = [](uint16_t x) simdutf_constexpr -> uint16_t { + return scalar::utf16::swap_if_needed(x); + }; + const char16_t replacement = scalar::utf16::replacement(); + + __m128i lookback, block, lb_masked, block_masked, lb_is_high, block_is_low; + __m128i illseq, lb_illseq, block_illseq; + + lookback = _mm_loadu_si128((const __m128i *)(in - 1)); + block = _mm_loadu_si128((const __m128i *)in); + lb_masked = _mm_and_si128(lookback, _mm_set1_epi16(swap_if_needed(0xfc00U))); + block_masked = _mm_and_si128(block, _mm_set1_epi16(swap_if_needed(0xfc00U))); + lb_is_high = + _mm_cmpeq_epi16(lb_masked, _mm_set1_epi16(swap_if_needed(0xd800U))); + block_is_low = + _mm_cmpeq_epi16(block_masked, _mm_set1_epi16(swap_if_needed(0xdc00U))); + + illseq = _mm_xor_si128(lb_is_high, block_is_low); + if (_mm_movemask_epi8(illseq) != 0) { + int lb; + + /* compute the cause of the illegal sequencing */ + lb_illseq = _mm_andnot_si128(block_is_low, lb_is_high); + block_illseq = _mm_or_si128(_mm_andnot_si128(lb_is_high, block_is_low), + _mm_bsrli_si128(lb_illseq, 2)); + + /* fix illegal sequencing in the lookback */ + lb = _mm_cvtsi128_si32(lb_illseq); + lb = (lb & replacement) | (~lb & out[-1]); + out[-1] = char16_t(lb); + /* fix illegal sequencing in the main block */ + block = + _mm_or_si128(_mm_andnot_si128(block_illseq, block), + _mm_and_si128(block_illseq, _mm_set1_epi16(replacement))); + _mm_storeu_si128((__m128i *)out, block); + } else if (!in_place) { + _mm_storeu_si128((__m128i *)out, block); + } +} + +template +void utf16fix_sse(const char16_t *in, size_t n, char16_t *out) { + const char16_t replacement = scalar::utf16::replacement(); + size_t i; + if (n < 9) { + scalar::utf16::to_well_formed_utf16(in, n, out); + return; + } + + out[0] = + scalar::utf16::is_low_surrogate(in[0]) ? replacement : in[0]; + + /* duplicate code to have the compiler specialise utf16fix_block() */ + if (in == out) { + for (i = 1; i + 8 < n; i += 8) { + utf16fix_block_sse(out + i, in + i); + } + + utf16fix_block_sse(out + n - 8, in + n - 8); + } else { + for (i = 1; i + 8 < n; i += 8) { + utf16fix_block_sse(out + i, in + i); + } + + utf16fix_block_sse(out + n - 8, in + n - 8); + } + + out[n - 1] = scalar::utf16::is_high_surrogate(out[n - 1]) + ? 
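// [Editor's sketch, not vendored simdutf code.] The pack step of
// write_v_u16_11bits_to_utf8 above compacts the doubled movemask bits
// (hhggffeeddccbbaa) into one bit per 16-bit lane, which indexes the
// pack_1_2_utf8_bytes shuffle table:
#include <cstdint>
static inline uint8_t compact_doubled_mask(uint16_t doubled) {
  uint16_t m0 = doubled & 0x5555;   // 0h0g0f0e0d0c0b0a
  uint16_t m1 = uint16_t(m0 >> 7);  // 00000000h0g0f0e0
  return uint8_t((m0 | m1) & 0xFF); // hdgcfbea
}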
replacement + : out[n - 1]; +} +/* end file src/westmere/sse_utf16fix.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/westmere/sse_validate_utf16.cpp */ +template +simd8 utf16_gather_high_bytes(const simd16 in0, + const simd16 in1) { + if (big_endian) { + // we want lower bytes + const auto mask = simd16(0x00ff); + const auto t0 = in0 & mask; + const auto t1 = in1 & mask; + + return simd16::pack(t0, t1); + } else { + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + return simd16::pack(t0, t1); + } +} +/* end file src/westmere/sse_validate_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/westmere/sse_convert_latin1_to_utf8.cpp */ +std::pair +sse_convert_latin1_to_utf8(const char *latin_input, + const size_t latin_input_length, char *utf8_output) { + const char *end = latin_input + latin_input_length; + + const __m128i v_0000 = _mm_setzero_si128(); + // 0b1000_0000 + const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80); + // 0b1111_1111_1000_0000 + const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); + + const __m128i latin_1_half_into_u16_byte_mask = + _mm_setr_epi8(0, '\x80', 1, '\x80', 2, '\x80', 3, '\x80', 4, '\x80', 5, + '\x80', 6, '\x80', 7, '\x80'); + + const __m128i latin_2_half_into_u16_byte_mask = + _mm_setr_epi8(8, '\x80', 9, '\x80', 10, '\x80', 11, '\x80', 12, '\x80', + 13, '\x80', 14, '\x80', 15, '\x80'); + + // each latin1 takes 1-2 utf8 bytes + // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then + // adjust the pointer) so the last write can exceed the utf8_output size by + // 8-1 bytes by reserving 8 extra input bytes, we expect the output to have + // 8-16 bytes free + while (end - latin_input >= 16 + 8) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input); + + if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!! + _mm_storeu_si128((__m128i *)utf8_output, v_latin); + latin_input += 16; + utf8_output += 16; + continue; + } + + // assuming a/b are bytes and A/B are uint16 of the same value + // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA + __m128i v_u16_latin_1_half = + _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask); + // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB + __m128i v_u16_latin_2_half = + _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask); + + internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, + utf8_output, v_0000, v_ff80); + internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half, + utf8_output, v_0000, v_ff80); + latin_input += 16; + } + + if (end - latin_input >= 16) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input); + + if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!! 
+ _mm_storeu_si128((__m128i *)utf8_output, v_latin); + latin_input += 16; + utf8_output += 16; + } else { + // assuming a/b are bytes and A/B are uint16 of the same value + // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA + __m128i v_u16_latin_1_half = + _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask); + internal::westmere::write_v_u16_11bits_to_utf8( + v_u16_latin_1_half, utf8_output, v_0000, v_ff80); + latin_input += 8; + } + } + + return std::make_pair(latin_input, utf8_output); +} +/* end file src/westmere/sse_convert_latin1_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/westmere/sse_convert_latin1_to_utf16.cpp */ +template +std::pair +sse_convert_latin1_to_utf16(const char *latin1_input, size_t len, + char16_t *utf16_output) { + size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 + for (size_t i = 0; i < rounded_len; i += 16) { + // Load 16 Latin1 characters into a 128-bit register + __m128i in = + _mm_loadu_si128(reinterpret_cast(&latin1_input[i])); + __m128i out1 = big_endian ? _mm_unpacklo_epi8(_mm_setzero_si128(), in) + : _mm_unpacklo_epi8(in, _mm_setzero_si128()); + __m128i out2 = big_endian ? _mm_unpackhi_epi8(_mm_setzero_si128(), in) + : _mm_unpackhi_epi8(in, _mm_setzero_si128()); + // Zero extend each Latin1 character to 16-bit integers and store the + // results back to memory + _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i]), out1); + _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i + 8]), out2); + } + // return pointers pointing to where we left off + return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); +} +/* end file src/westmere/sse_convert_latin1_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/westmere/sse_convert_latin1_to_utf32.cpp */ +std::pair +sse_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + const char *end = buf + len; + + while (end - buf >= 16) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + __m128i in = _mm_loadu_si128((__m128i *)buf); + + // Shift input to process next 4 bytes + __m128i in_shifted1 = _mm_srli_si128(in, 4); + __m128i in_shifted2 = _mm_srli_si128(in, 8); + __m128i in_shifted3 = _mm_srli_si128(in, 12); + + // expand 8-bit to 32-bit unit + __m128i out1 = _mm_cvtepu8_epi32(in); + __m128i out2 = _mm_cvtepu8_epi32(in_shifted1); + __m128i out3 = _mm_cvtepu8_epi32(in_shifted2); + __m128i out4 = _mm_cvtepu8_epi32(in_shifted3); + + _mm_storeu_si128((__m128i *)utf32_output, out1); + _mm_storeu_si128((__m128i *)(utf32_output + 4), out2); + _mm_storeu_si128((__m128i *)(utf32_output + 8), out3); + _mm_storeu_si128((__m128i *)(utf32_output + 12), out4); + + utf32_output += 16; + buf += 16; + } + + return std::make_pair(buf, utf32_output); +} +/* end file src/westmere/sse_convert_latin1_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). 
+template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218 + __m128i ascii_first = _mm_cvtepu8_epi16(in); + __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in, 8)); + if (big_endian) { + ascii_first = _mm_shuffle_epi8(ascii_first, swap); + ascii_second = _mm_shuffle_epi8(ascii_second, swap); + } + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), + ascii_second); + utf16_output += 12; // We wrote 12 16-bit characters. + return 12; // We consumed 12 bytes. + } + if (((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte + // UTF-16 code units. There is probably a more efficient sequence, but the + // following might do. + const __m128i sh = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) + composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. There is probably a more efficient sequence, but the + // following might do. + const __m128i sh = + _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + __m128i composed_repacked = _mm_packus_epi32(composed, composed); + if (big_endian) + composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. 
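// [Editor's note.] utf8_end_of_code_point_mask sets bit i when input byte
// i ends a code point, so the fast-path constants above read as layouts:
// 0xfff = twelve 1-byte (ASCII) units, 0xaaaa = eight 2-byte units
// (0b1010_1010_1010_1010), 0x924 = four 3-byte units (0b1001_0010_0100).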
+ + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small + // lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) + composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-code units + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + __m128i composed_repacked = _mm_packus_epi32(composed, composed); + if (big_endian) + composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + } else if (idx < 209) { + // TWO (2) input code-code units + ////////////// + // There might be garbage inputs where a leading byte mascarades as a + // four-byte leading byte (by being followed by 3 continuation byte), but is + // not greater than 0xf0. This could trigger a buffer overflow if we only + // counted leading bytes of the form 0xf0 as generating surrogate pairs, + // without further UTF-8 validation. Thus we must be careful to ensure that + // only leading bytes at least as large as 0xf0 generate surrogate pairs. We + // do as at the cost of an extra mask. + ///////////// + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + // We deliberately carry the leading four bits in highbyte if they are + // present, we remove them later when computing hightenbits. 
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + // When we need to generate a surrogate pair (leading byte > 0xF0), then + // the corresponding 32-bit value in 'composed' will be greater than + // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the + // location of the surrogate pairs. + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + const __m128i composedminus = + _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); + const __m128i lowtenbits = + _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); + // Notice the 0x3ff mask: + const __m128i hightenbits = + _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff)); + const __m128i lowtenbitsadd = + _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); + const __m128i hightenbitsadd = + _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); + const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); + __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + uint32_t basic_buffer_swap[4]; + if (big_endian) { + _mm_storeu_si128((__m128i *)basic_buffer_swap, + _mm_shuffle_epi8(composed, swap)); + surrogates = _mm_shuffle_epi8(surrogates, swap); + } + _mm_storeu_si128((__m128i *)basic_buffer, composed); + uint32_t surrogate_buffer[4]; + _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] > 0x3c00000) { + utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; + } else { + utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) + : uint16_t(basic_buffer[i]); + utf16_output++; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. 
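// [Editor's sketch, not vendored simdutf code.] Scalar model of the
// surrogate construction above (composedminus / lowtenbits /
// hightenbits): subtract 0x10000 and split the remaining 20 bits into two
// 10-bit halves.
#include <cstdint>
static inline void make_surrogate_pair(uint32_t cp, uint16_t *hi,
                                       uint16_t *lo) {
  uint32_t v = cp - 0x10000u;            // cp in 0x10000..0x10FFFF
  *hi = uint16_t(0xD800 + (v >> 10));    // high ten bits
  *lo = uint16_t(0xDC00 + (v & 0x3FFu)); // low ten bits
}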
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+                     _mm_cvtepu8_epi32(in));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 4)));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 8),
+                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 12),
+                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 12)));
+    utf32_output += 12; // We wrote 12 32-bit characters.
+    return 12;          // We consumed 12 bytes.
+  }
+  if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+    // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte
+    // UTF-32 code units. There is probably a more efficient sequence, but the
+    // following might do.
+    const __m128i sh =
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+                     _mm_cvtepu16_epi32(composed));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+                     _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
+    utf32_output += 8; // We wrote 32 bytes, 8 code points.
+    return 16;
+  }
+  if (input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
+    // UTF-32 code units. There is probably a more efficient sequence, but the
+    // following might do.
+    const __m128i sh =
+        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    _mm_storeu_si128((__m128i *)utf32_output, composed);
+    utf32_output += 4;
+    return 12;
+  }
+  /// We do not have a fast path available, so we fall back.
+
+  const uint8_t idx =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+  if (idx < 64) {
+    // SIX (6) input code units
+    // this is a relatively easy scenario
+    // we process SIX (6) input code units. The max length in bytes of six
+    // code units spanning between 1 and 2 bytes each is 12 bytes. On
+    // processors where pdep/pext is fast, we might be able to use a small
+    // lookup table.
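+    // [Editorial sketch.] After the shuffle, each 16-bit lane of 'perm'
+    // holds one 1- or 2-byte sequence with the lead byte in the lane's high
+    // byte. The masked ORs below vectorize the scalar 2-byte decode:
+    //
+    //   uint16_t lane;                                  // 110bbbbb'10cccccc
+    //   uint16_t cp = uint16_t((lane & 0x7F)            // cccccc payload
+    //                          | ((lane & 0x1F00) >> 2)); // bbbbb << 6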
+ const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), + _mm_cvtepu16_epi32(composed)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), + _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8))); + utf32_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-code units + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + } else if (idx < 209) { + // TWO (2) input code-code units + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 3; + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; } +/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); - if (!std::get<2>(ret)) { - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - scalar_res.count += (std::get<0>(ret) - buf); - return scalar_res; - } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_res.error) { - scalar_res.count += (std::get<0>(ret) - buf); - return scalar_res; - } else { - 
scalar_res.count += saved_bytes; - return scalar_res; - } - } - return simdutf::result(simdutf::SUCCESS, saved_bytes); -} +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/westmere/sse_convert_utf8_to_latin1.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); - if (!std::get<2>(ret)) { - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - scalar_res.count += (std::get<0>(ret) - buf); - return scalar_res; +// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & + 0xfff; // we are only processing 12 bytes in case it is not all ASCII + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); + latin1_output += 12; // We wrote 12 characters. + return 12; // We consumed 12 bytes. } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_res.error) { - scalar_res.count += (std::get<0>(ret) - buf); - return scalar_res; - } else { - scalar_res.count += saved_bytes; - return scalar_res; - } + /// We do not have a fast path available, so we fallback. + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if (idx >= 64) { + return consumed; } - return simdutf::result(simdutf::SUCCESS, saved_bytes); + // Here we should have (idx < 64), if not, there is a bug in the validation or + // elsewhere. SIX (6) input code-code units this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small lookup + // table. 
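+  // [Editorial note.] Latin-1 covers only U+0000..U+00FF, so valid input
+  // here holds 1- and 2-byte sequences exclusively; that is why idx >= 64
+  // (3- and 4-byte patterns) was rejected above. Scalar form of the 2-byte
+  // decode that the shuffle-and-mask below performs:
+  //
+  //   // lead is 0xC2 or 0xC3 for U+0080..U+00FF
+  //   uint8_t latin1 = uint8_t(((lead & 0x1F) << 6) | (trail & 0x3F));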
+ const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + const __m128i latin1_packed = _mm_packus_epi16(composed, composed); + // writing 8 bytes even though we only care about the first 6 bytes. + // performance note: it would be faster to use _mm_storeu_si128, we should + // investigate. + _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); + latin1_output += 6; // We wrote 6 bytes. + return consumed; } +/* end file src/westmere/sse_convert_utf8_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); - if (!std::get<2>(ret)) { return 0; } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/westmere/sse_convert_utf16_to_latin1.cpp */ +template +std::pair +sse_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + while (end - buf >= 8) { + // Load 8 UTF-16 characters into 128-bit SSE register + __m128i in = _mm_loadu_si128(reinterpret_cast(buf)); -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); - if (!std::get<2>(ret)) { return 0; } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + if simdutf_constexpr (!match_system(big_endian)) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); + } -void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { - size_t pos = 0; - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - while (pos + 32 <= length) { - __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos)); - utf16 = _mm512_shuffle_epi8(utf16, byteflip); - _mm512_storeu_si512(output + pos, utf16); - pos += 32; - } - if(pos < length) { - __mmask32 m((1<< (length - pos))-1); - __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos)); - utf16 = _mm512_shuffle_epi8(utf16, byteflip); - _mm512_mask_storeu_epi16(output + pos, m, utf16); - } + __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00); + if (_mm_testz_si128(in, high_byte_mask)) 
{ + // Pack 16-bit characters into 8-bit and store in latin1_output + __m128i latin1_packed = _mm_packus_epi16(in, in); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), + latin1_packed); + // Adjust pointers for next iteration + buf += 8; + latin1_output += 8; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); } +template +std::pair +sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + while (end - buf >= 8) { + __m128i in = _mm_loadu_si128(reinterpret_cast(buf)); -simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { - const char16_t* end = length >= 32 ? input + length - 32 : nullptr; - const char16_t* ptr = input; + if simdutf_constexpr (!match_system(big_endian)) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); + } - const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00); - const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff); + __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00); + if (_mm_testz_si128(in, high_byte_mask)) { + __m128i latin1_packed = _mm_packus_epi16(in, in); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), + latin1_packed); + buf += 8; + latin1_output += 8; + } else { + // Fallback to scalar code for handling errors + for (int k = 0; k < 8; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + buf += 8; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} +/* end file src/westmere/sse_convert_utf16_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 - size_t count{0}; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. - while (ptr <= end) { - __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr); - ptr += 32; - uint64_t not_high_surrogate = static_cast(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low)); - count += count_ones(not_high_surrogate); - } + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. - return count + scalar::utf16::count_code_points(ptr, length - (ptr - input)); -} + Ad 1. -simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { - const char16_t* end = length >= 32 ? input + length - 32 : nullptr; - const char16_t* ptr = input; + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. - const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00); - const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff); + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. 
- size_t count{0}; - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - while (ptr <= end) { - __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip); - ptr += 32; - uint64_t not_high_surrogate = static_cast(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low)); - count += count_ones(not_high_surrogate); - } + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. - return count + scalar::utf16::count_code_points(ptr, length - (ptr - input)); -} + Ad 2. + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. -simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { - const uint8_t *str = reinterpret_cast(input); - size_t answer = length / sizeof(__m512i) * sizeof(__m512i); // Number of 512-bit chunks that fits into the length. - size_t i = 0; - __m512i unrolled_popcount{0}; + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. - const __m512i continuation = _mm512_set1_epi8(char(0b10111111)); + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. - while (i + sizeof(__m512i) <= length) { - size_t iterations = (length - i) / sizeof(__m512i); + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. - size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); - for (; i + 8*sizeof(__m512i) <= max_i; i += 8*sizeof(__m512i)) { - __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); - __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); - __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i))); - __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i))); - __m512i input5 = _mm512_loadu_si512((const __m512i *)(str + i + 4*sizeof(__m512i))); - __m512i input6 = _mm512_loadu_si512((const __m512i *)(str + i + 5*sizeof(__m512i))); - __m512i input7 = _mm512_loadu_si512((const __m512i *)(str + i + 6*sizeof(__m512i))); - __m512i input8 = _mm512_loadu_si512((const __m512i *)(str + i + 7*sizeof(__m512i))); + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ - __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation); - __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation); - __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation); - __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation); - __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation); - __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation); - __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation); - __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation); +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. 
+*/ +template +std::pair +sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { - __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5, mask4, mask3, mask2, mask1); + const char16_t *end = buf + len; + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 - unrolled_popcount = _mm512_add_epi64(unrolled_popcount, _mm512_popcnt_epi64(mask_register)); + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m128i in = _mm_loadu_si128((__m128i *)buf); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); } - - for (; i <= max_i; i += sizeof(__m512i)) { - __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); - uint64_t continuation_bitmask = static_cast(_mm512_cmple_epi8_mask(more_input, continuation)); - answer -= count_ones(continuation_bitmask); + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); + if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! + __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + nextin = _mm_shuffle_epi8(nextin, swap); + } + if (!_mm_testz_si128(nextin, v_ff80)) { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in, in); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in, nextin); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! 
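+        // [Editorial note.] _mm_packus_epi16 saturates 16-bit lanes to
+        // unsigned bytes; both registers were checked against 0xFF80 above,
+        // so the pack is exact. Scalar equivalent of this two-register
+        // ASCII fast path:
+        //
+        //   for (int i = 0; i < 16; i++) { utf8_output[i] = char(buf[i]); }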
+ } } - } - - __m256i first_half = _mm512_extracti64x4_epi64(unrolled_popcount, 0); - __m256i second_half = _mm512_extracti64x4_epi64(unrolled_popcount, 1); - answer -= (size_t)_mm256_extract_epi64(first_half, 0) + - (size_t)_mm256_extract_epi64(first_half, 1) + - (size_t)_mm256_extract_epi64(first_half, 2) + - (size_t)_mm256_extract_epi64(first_half, 3) + - (size_t)_mm256_extract_epi64(second_half, 0) + - (size_t)_mm256_extract_epi64(second_half, 1) + - (size_t)_mm256_extract_epi64(second_half, 2) + - (size_t)_mm256_extract_epi64(second_half, 3); - return answer + scalar::utf8::count_code_points(reinterpret_cast(str + i), length - i); -} + // no bits set above 7th bit + const __m128i one_byte_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); + const uint16_t one_byte_bitmask = + static_cast(_mm_movemask_epi8(one_byte_bytemask)); -simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { - return count_utf8(buf,len); -} + // no bits set above 11th bit + const __m128i one_or_two_bytes_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = + static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); -simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { - return scalar::utf16::latin1_length_from_utf16(length); -} + if (one_or_two_bytes_bitmask == 0xffff) { + internal::westmere::write_v_u16_11bits_to_utf8( + in, utf8_output, one_byte_bytemask, one_byte_bitmask); + buf += 8; + continue; + } -simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept { - return scalar::utf32::latin1_length_from_utf32(length); -} + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m128i surrogates_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - const char16_t* end = length >= 32 ? input + length - 32 : nullptr; - const char16_t* ptr = input; + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint16_t surrogates_bitmask = + static_cast(_mm_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x0000) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); - const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); - const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); - const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes - size_t count{0}; + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. 
- while (ptr <= end) { - __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr); - ptr += 32; - __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); - __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); - __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); - __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - size_t ascii_count = count_ones(ascii_bitmask); - size_t two_bytes_count = count_ones(two_bytes_bitmask); - size_t surrogate_bytes_count = count_ones(surrogates_bitmask); - size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count; + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. - count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count; - } + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m128i t0 = _mm_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); - return count + scalar::utf16::utf8_length_from_utf16(ptr, length - (ptr - input)); -} + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m128i s0 = _mm_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); +#undef simdutf_vec -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - const char16_t* end = length >= 32 ? input + length - 32 : nullptr; - const char16_t* ptr = input; + // 4. expand code units 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); - const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); - const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); - const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); - const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = + (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); + if (mask == 0) { + // We only have three-byte code units. Use fast path. 
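+        // [Editorial note.] 'mask' interleaves per-lane size flags: bit 2*i
+        // is set when lane i is ASCII (1 byte) and bit 2*i+1 when lane i
+        // fits in two bytes. mask == 0 thus means every lane needs the full
+        // 3-byte encoding, whose scalar form is:
+        //
+        //   out[0] = char(0xE0 | (cp >> 12));
+        //   out[1] = char(0x80 | ((cp >> 6) & 0x3F));
+        //   out[2] = char(0x80 | (cp & 0x3F));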
+ const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, + 15, 13, -1, -1, -1, -1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); - size_t count{0}; - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - while (ptr <= end) { - __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr); - utf16 = _mm512_shuffle_epi8(utf16, byteflip); - ptr += 32; - __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); - __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); - __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); - __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); - size_t ascii_count = count_ones(ascii_bitmask); - size_t two_bytes_count = count_ones(two_bytes_bitmask); - size_t surrogate_bytes_count = count_ones(surrogates_bitmask); - size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count; - count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count; - } + const uint8_t mask1 = static_cast(mask >> 8); - return count + scalar::utf16::utf8_length_from_utf16(ptr, length - (ptr - input)); -} + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return implementation::count_utf16le(input, length); -} + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return implementation::count_utf16be(input, length); -} + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
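+      // [Editorial note.] The window of 15 (not 16) code units keeps
+      // buf[k + 1] in bounds when the last unit examined is the leading half
+      // of a surrogate pair; the clamp below handles short tails the same way.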
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, utf8_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while -simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf16_length_from_latin1(length); + return std::make_pair(buf, utf8_output); } +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ +template +std::pair +sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; -simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf32_length_from_latin1(length); -} + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 -simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept { - const uint8_t *str = reinterpret_cast(input); - size_t answer = length / sizeof(__m512i) * sizeof(__m512i); - size_t i = 0; - unsigned char v_0xFF = 0xff; - __m512i eight_64bits = _mm512_setzero_si512(); - while (i + sizeof(__m512i) <= length) { - __m512i runner = _mm512_setzero_si512(); - size_t iterations = (length - i) / sizeof(__m512i); - if (iterations > 255) { - iterations = 255; - } - size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); - for (; i + 4*sizeof(__m512i) <= max_i; i += 4*sizeof(__m512i)) { - // Load four __m512i vectors - __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); - __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); - __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i))); - __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i))); - - // Generate four masks - __mmask64 mask1 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1); - __mmask64 mask2 = 
_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2); - __mmask64 mask3 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3); - __mmask64 mask4 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4); - // Apply the masks and subtract from the runner - __m512i not_ascii1 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF); - __m512i not_ascii2 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF); - __m512i not_ascii3 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF); - __m512i not_ascii4 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF); - - runner = _mm512_sub_epi8(runner, not_ascii1); - runner = _mm512_sub_epi8(runner, not_ascii2); - runner = _mm512_sub_epi8(runner, not_ascii3); - runner = _mm512_sub_epi8(runner, not_ascii4); + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m128i in = _mm_loadu_si128((__m128i *)buf); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); } - - for (; i <= max_i; i += sizeof(__m512i)) { - __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); - - __mmask64 mask = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input); - __m512i not_ascii = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF); - runner = _mm512_sub_epi8(runner, not_ascii); + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); + if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! + __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + nextin = _mm_shuffle_epi8(nextin, swap); + } + if (!_mm_testz_si128(nextin, v_ff80)) { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in, in); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in, nextin); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! 
+ } } - eight_64bits = _mm512_add_epi64(eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512())); - } + // no bits set above 7th bit + const __m128i one_byte_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); + const uint16_t one_byte_bitmask = + static_cast(_mm_movemask_epi8(one_byte_bytemask)); - __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0); - __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1); - answer += (size_t)_mm256_extract_epi64(first_half, 0) + - (size_t)_mm256_extract_epi64(first_half, 1) + - (size_t)_mm256_extract_epi64(first_half, 2) + - (size_t)_mm256_extract_epi64(first_half, 3) + - (size_t)_mm256_extract_epi64(second_half, 0) + - (size_t)_mm256_extract_epi64(second_half, 1) + - (size_t)_mm256_extract_epi64(second_half, 2) + - (size_t)_mm256_extract_epi64(second_half, 3); - return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast(str + i), length - i); -} + // no bits set above 11th bit + const __m128i one_or_two_bytes_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = + static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); -simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for(;pos + 64 <= length; pos += 64) { - __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input+pos)); - uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, _mm512_set1_epi8(-65+1)); - // We count one word for anything that is not a continuation (so - // leading bytes). - count += 64 - count_ones(utf8_continuation_mask); - uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240))); - count += count_ones(utf8_4byte); + if (one_or_two_bytes_bitmask == 0xffff) { + internal::westmere::write_v_u16_11bits_to_utf8( + in, utf8_output, one_byte_bytemask, one_byte_bitmask); + buf += 8; + continue; } - return count + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos); -} -simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { - const char32_t* end = length >= 16 ? input + length - 16 : nullptr; - const char32_t* ptr = input; + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m128i surrogates_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint16_t surrogates_bitmask = + static_cast(_mm_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x0000) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f); - const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff); - const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes - size_t count{0}; + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - while (ptr <= end) { - __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr); - ptr += 16; - __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f); - __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff); - __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff); + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - size_t ascii_count = count_ones(ascii_bitmask); - size_t two_bytes_count = count_ones(two_bytes_bitmask); - size_t three_bytes_count = count_ones(three_bytes_bitmask); - size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count; - count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 4*four_bytes_count; - } + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. - return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input)); -} + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m128i t0 = _mm_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); -simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { - const char32_t* end = length >= 16 ? input + length - 16 : nullptr; - const char32_t* ptr = input; + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m128i s0 = _mm_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); +#undef simdutf_vec - const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); + // 4. expand code units 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); - size_t count{0}; + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = + (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); + if (mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, + 15, 13, -1, -1, -1, -1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); - while (ptr <= end) { - __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr); - ptr += 16; - __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); - count += 16 + count_ones(surrogates_bitmask); - } + const uint8_t mask1 = static_cast(mask >> 8); - return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input)); -} + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); -simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { - return implementation::count_utf8(input, length); -} + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); -} + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + utf8_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while -simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept { - return (options & base64_url) ? 
compress_decode_base64(output, input, length, options) : compress_decode_base64(output, input, length, options); + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); } +/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); -} +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. -simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept { - return (options & base64_url) ? compress_decode_base64(output, input, length, options) : compress_decode_base64(output, input, length, options); -} + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + Ad 1. -simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length, base64_options options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); -} + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. -size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept { - if(options & base64_url) { - return encode_base64(output, input, length, options); - } else { - return encode_base64(output, input, length, options); - } -} + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. -} // namespace icelake -} // namespace simdutf + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. -/* begin file src/simdutf/icelake/end.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE -// nothing needed. -#else -SIMDUTF_UNTARGET_REGION -#endif + Ad 2. + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. -#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 -SIMDUTF_POP_DISABLE_WARNINGS -#endif // end of workaround -/* end file src/simdutf/icelake/end.h */ -/* end file src/icelake/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_HASWELL -/* begin file src/haswell/implementation.cpp */ + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. -/* begin file src/simdutf/haswell/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "haswell" -// #define SIMDUTF_IMPLEMENTATION haswell + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. 
-#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL -// nothing needed. -#else -SIMDUTF_TARGET_HASWELL -#endif + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. -#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 -SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) -#endif // end of workaround -/* end file src/simdutf/haswell/begin.h */ -namespace simdutf { -namespace haswell { -namespace { -#ifndef SIMDUTF_HASWELL_H -#error "haswell.h must be included" -#endif -using namespace simd; + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ -simdutf_really_inline bool is_ascii(const simd8x64& input) { - return input.reduce_or().is_ascii(); -} +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routine should carry on the conversion of the tail. +*/ +template +std::pair +sse_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_output) { + const char16_t *end = buf + len; -simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { - simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 - simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 - simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 - // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. - return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); -} + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); -simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { - simd8 is_third_byte = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be > 0x80 - simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be > 0x80 - return simd8(is_third_byte | is_fourth_byte); -} + while (end - buf >= 8) { + __m128i in = _mm_loadu_si128((__m128i *)buf); -/* begin file src/haswell/avx2_detect_encodings.cpp */ -template -// len is known to be a multiple of 2 when this is called -int avx2_detect_encodings(const char * buf, size_t len) { - const char* start = buf; - const char* end = buf + len; + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); + } - bool is_utf8 = true; - bool is_utf16 = true; - bool is_utf32 = true; + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m128i surrogates_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); - int out = 0; + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint16_t surrogates_bitmask = + static_cast(_mm_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. 
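+    // [Editorial sketch.] With no surrogates in the register, every 16-bit
+    // unit is already a complete scalar value, so widening to UTF-32 is pure
+    // zero-extension (after any byte swap above):
+    //
+    //   for (int i = 0; i < 8; i++) { utf32_output[i] = char32_t(in16[i]); }
+    //
+    // where in16 denotes the eight (possibly byte-swapped) code units.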
+    if (surrogates_bitmask == 0x0000) {
+      // case: no surrogate pair, extend 16-bit code units to 32-bit code units
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+                       _mm_cvtepu16_epi32(in));
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+                       _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
+      utf32_output += 8;
+      buf += 8;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = scalar::utf16::swap_if_needed<big_endian>(buf[k]);
+        if ((word & 0xF800) != 0xD800) {
+          *utf32_output++ = char32_t(word);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word =
+              scalar::utf16::swap_if_needed<big_endian>(buf[k + 1]);
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            return std::make_pair(nullptr, utf32_output);
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
+        }
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(buf, utf32_output);
+}
+
+/*
+  Returns a pair: a result struct and utf32_output.
+  If there is an error, the count field of the result is the position of the
+  error. Otherwise, it is the position of the first unprocessed byte in buf
+  (even if finished). A scalar routine should carry on the conversion of the
+  tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+                                       char32_t *utf32_output) {
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+
+  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+
+  while (end - buf >= 8) {
+    __m128i in = _mm_loadu_si128((__m128i *)buf);
+
+    if (big_endian) {
+      const __m128i swap =
+          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
+    }
+
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We must also deal with the situation where there is a surrogate word
+    //    at the end of a chunk.
+    const __m128i surrogates_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+    // bitmask = 0x0000 if there are no surrogates
+    //         = 0xc000 if the last word is a surrogate
+    const uint16_t surrogates_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x0000) { + // case: no surrogate pair, extend 16-bit code units to 32-bit code units + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), + _mm_cvtepu16_epi32(in)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), + _mm_cvtepu16_epi32(_mm_srli_si128(in, 8))); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + utf32_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); +} +/* end file src/westmere/sse_convert_utf16_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 - const auto in16 = simd16::pack(v0, v1); +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/westmere/sse_convert_utf32_to_latin1.cpp */ +std::pair +sse_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8; - uint32_t surrogates_bitmask0 = surrogates_wordmask0.to_bitmask(); + __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); + __m128i shufmask = + _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); - // Check for surrogates - if (surrogates_bitmask0 != 0x0) { - // Cannot be UTF8 - is_utf8 = false; - // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates - // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. - // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant - // bytes of a 32-bit word since they always come in pairs in UTF-16LE. - // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units. 
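The disambiguation rule stated in the removed comment above (surrogates in the two most significant bytes of a 32-bit word rule out UTF-32, while paired UTF-16LE surrogates always put one there) is easy to model in scalar code; a minimal sketch with illustrative names, assuming 32-bit little-endian words:

#include <cstddef>
#include <cstdint>

// In valid UTF-32LE no surrogate value may sit in the upper 16 bits of a
// 32-bit word, while in valid UTF-16LE surrogates come in adjacent pairs,
// so at least one of them lands in an upper half.
static bool surrogate_in_high_half(const uint32_t *words, size_t n) {
  for (size_t i = 0; i < n; i++) {
    if (((words[i] >> 16) & 0xF800) == 0xD800) return true;
  }
  return false;
}

int main() {
  const uint32_t utf16le_pair[1] = {0xDE00D83Du}; // U+1F600 as UTF-16LE
  const uint32_t utf32le_char[1] = {0x0001F600u}; // U+1F600 as UTF-32LE
  return (surrogate_in_high_half(utf16le_pair, 1) &&
          !surrogate_in_high_half(utf32le_char, 1)) ? 0 : 1;
}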
+ for (size_t i = 0; i < rounded_len; i += 16) { + __m128i in1 = _mm_loadu_si128((__m128i *)buf); + __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4)); + __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8)); + __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12)); - if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) { - is_utf32 = false; - // Code from avx2_validate_utf16le.cpp - const char16_t * input = reinterpret_cast(buf); - const char16_t* end16 = reinterpret_cast(start) + len/2; + __m128i check_combined = _mm_or_si128(in1, in2); + check_combined = _mm_or_si128(check_combined, in3); + check_combined = _mm_or_si128(check_combined, in4); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); + if (!_mm_testz_si128(check_combined, high_bytes_mask)) { + return std::make_pair(nullptr, latin1_output); + } + __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), + _mm_shuffle_epi8(in2, shufmask)); + __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), + _mm_shuffle_epi8(in4, shufmask)); + __m128i pack = _mm_unpacklo_epi64(pack1, pack2); + _mm_storeu_si128((__m128i *)latin1_output, pack); + latin1_output += 16; + buf += 16; + } - const uint32_t V0 = ~surrogates_bitmask0; + return std::make_pair(buf, latin1_output); +} - const auto vH0 = (in16 & v_fc) == v_dc; - const uint32_t H0 = vH0.to_bitmask(); +std::pair +sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *start = buf; + const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - const uint32_t L0 = ~H0 & surrogates_bitmask0; + __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); + __m128i shufmask = + _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); - const uint32_t a0 = L0 & (H0 >> 1); - const uint32_t b0 = a0 << 1; - const uint32_t c0 = V0 | a0 | b0; + for (size_t i = 0; i < rounded_len; i += 16) { + __m128i in1 = _mm_loadu_si128((__m128i *)buf); + __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4)); + __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8)); + __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12)); - if (c0 == 0xffffffff) { - input += simd16::ELEMENTS * 2; - } else if (c0 == 0x7fffffff) { - input += simd16::ELEMENTS * 2 - 1; - } else { - return simdutf::encoding_type::unspecified; - } + __m128i check_combined = _mm_or_si128(in1, in2); + check_combined = _mm_or_si128(check_combined, in3); + check_combined = _mm_or_si128(check_combined, in4); - while (input + simd16::ELEMENTS * 2 < end16) { - const auto in0 = simd16(input); - const auto in1 = simd16(input + simd16::ELEMENTS); + if (!_mm_testz_si128(check_combined, high_bytes_mask)) { + // Fallback to scalar code for handling errors + for (int k = 0; k < 16; k++) { + char32_t codepoint = buf[k]; + if (codepoint <= 0xff) { + *latin1_output++ = char(codepoint); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + buf += 16; + continue; + } + __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), + _mm_shuffle_epi8(in2, shufmask)); + __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), + _mm_shuffle_epi8(in4, shufmask)); + __m128i pack = _mm_unpacklo_epi64(pack1, pack2); + _mm_storeu_si128((__m128i *)latin1_output, pack); + latin1_output += 16; + buf += 16; + } - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} +/* end file 
src/westmere/sse_convert_utf32_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - const auto in_16 = simd16::pack(t0, t1); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */ +std::pair +sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { + const char32_t *end = buf + len; - const auto surrogates_wordmask = (in_16 & v_f8) == v_d8; - const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); - if (surrogates_bitmask == 0x0) { - input += simd16::ELEMENTS * 2; - } else { - const uint32_t V = ~surrogates_bitmask; + const __m128i v_0000 = _mm_setzero_si128(); //__m128 = 128 bits + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000 + // 0000 + const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000 + // 0000 + const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000 + // 0000 + const __m128i v_ffff0000 = _mm_set1_epi32( + (uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000 + const __m128i v_7fffffff = _mm_set1_epi32( + (uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111 + __m128i running_max = _mm_setzero_si128(); + __m128i forbidden_bytemask = _mm_setzero_si128(); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= + std::ptrdiff_t( + 16 + safety_margin)) { // buf is a char32_t pointer, each char32_t + // has 4 bytes or 32 bits, thus buf + 16 * + // char_32t = 512 bits = 64 bytes + // We load two 16 bytes registers for a total of 32 bytes or 16 characters. + __m128i in = _mm_loadu_si128((__m128i *)buf); + __m128i nextin = _mm_loadu_si128( + (__m128i *)buf + 1); // These two values can hold only 8 UTF32 chars + running_max = _mm_max_epu32( + _mm_max_epu32(in, running_max), // take element-wise max char32_t from + // in and running_max vector + nextin); // and take element-wise max element from nextin and + // running_max vector - const auto vH = (in_16 & v_fc) == v_dc; - const uint32_t H = vH.to_bitmask(); + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + __m128i in_16 = _mm_packus_epi32( + _mm_and_si128(in, v_7fffffff), + _mm_and_si128( + nextin, + v_7fffffff)); // in this context pack the two __m128 into a single + // By ensuring the highest bit is set to 0(&v_7fffffff), we are making sure + // all values are interpreted as non-negative, or specifically, the values + // are within the range of valid Unicode code points. remember : having + // leading byte 0 means a positive number by the two complements system. + // Unicode is well beneath the range where you'll start getting issues so + // that's OK. - const uint32_t L = ~H & surrogates_bitmask; + // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp - const uint32_t a = L & (H >> 1); + // Check for ASCII fast path - const uint32_t b = a << 1; + // ASCII fast path!!!! + // We eagerly load another 32 bytes, hoping that they will be ASCII too. + // The intuition is that we try to collect 16 ASCII characters which + // requires a total of 64 bytes of input. If we fail, we just pass thirdin + // and fourthin as our new inputs. 
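The ASCII fast path rests on the observation that code points below 0x80 shrink from 4 bytes to 1, so 16 char32_t of input (64 bytes) narrow to 16 output bytes; a scalar sketch of the same test-then-narrow step (illustrative names, not simdutf's API):

#include <cstddef>
#include <cstdint>

// If no code point in the block has bits above the 7th, every one of them
// becomes a single UTF-8 byte, so the block can be narrowed in one pass.
static size_t ascii_fast_path(const char32_t *in, size_t n, char *out) {
  uint32_t combined = 0;
  for (size_t i = 0; i < n; i++) combined |= uint32_t(in[i]);
  if (combined & 0xFFFFFF80u) return 0; // not all ASCII: take the slow path
  for (size_t i = 0; i < n; i++) out[i] = char(in[i]);
  return n;
}

int main() {
  const char32_t text[5] = {U'h', U'e', U'l', U'l', U'o'};
  char out[5];
  return ascii_fast_path(text, 5, out) == 5 ? 0 : 1;
}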
+ if (_mm_testz_si128(in_16, v_ff80)) { // if the first two blocks are ASCII + __m128i thirdin = _mm_loadu_si128((__m128i *)buf + 2); + __m128i fourthin = _mm_loadu_si128((__m128i *)buf + 3); + running_max = _mm_max_epu32( + _mm_max_epu32(thirdin, running_max), + fourthin); // take the running max of all 4 vectors thus far + __m128i nextin_16 = _mm_packus_epi32( + _mm_and_si128(thirdin, v_7fffffff), + _mm_and_si128(fourthin, + v_7fffffff)); // pack into 1 vector, now you have two + if (!_mm_testz_si128( + nextin_16, + v_ff80)) { // checks if the second packed vector is ASCII, if not: + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16( + in_16, in_16); // creates two copy of in_16 in 1 vector + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, + utf8_packed); // put them into the output + // 3. adjust pointers + buf += 8; // the char32_t buffer pointer goes up 8 char32_t chars* 32 + // bits = 256 bits + utf8_output += + 8; // same with output, e.g. lift the first two blocks alone. + // Proceed with next input + in_16 = nextin_16; + // We need to update in and nextin because they are used later. + in = thirdin; + nextin = fourthin; + } else { + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } - const uint32_t c = V | a | b; - - if (c == 0xffffffff) { - input += simd16::ELEMENTS * 2; - } else if (c == 0x7fffffff) { - input += simd16::ELEMENTS * 2 - 1; - } else { - return simdutf::encoding_type::unspecified; - } - } - } - } else { - is_utf16 = false; - // Check for UTF-32 - if (len % 4 == 0) { - const char32_t * input = reinterpret_cast(buf); - const char32_t* end32 = reinterpret_cast(start) + len/4; - - // Must start checking for surrogates - __m256i currentoffsetmax = _mm256_setzero_si256(); - const __m256i offset = _mm256_set1_epi32(0xffff2000); - const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); - - currentmax = _mm256_max_epu32(in, currentmax); - currentmax = _mm256_max_epu32(nextin, currentmax); - - currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); - currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(nextin, offset), currentoffsetmax); - - while (input + 8 < end32) { - const __m256i in32 = _mm256_loadu_si256((__m256i *)input); - currentmax = _mm256_max_epu32(in32,currentmax); - currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in32, offset), currentoffsetmax); - input += 8; - } - - __m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(_mm256_testz_si256(forbidden_words, forbidden_words) == 0) { - return simdutf::encoding_type::unspecified; - } - } else { - return simdutf::encoding_type::unspecified; - } - } - break; - } - // If no surrogate, validate under other encodings as well + // no bits set above 7th bit -- find out all the ASCII characters + const __m128i one_byte_bytemask = + _mm_cmpeq_epi16( // this takes four bytes at a time and compares: + _mm_and_si128(in_16, v_ff80), // the vector that get only the first + // 9 bits of each 16-bit/2-byte units + v_0000 // + ); // they should be all zero if they are ASCII. E.g. 
ASCII in UTF32 is + // of format 0000 0000 0000 0XXX XXXX + // _mm_cmpeq_epi16 should now return a 1111 1111 1111 1111 for equals, and + // 0000 0000 0000 0000 if not for each 16-bit/2-byte units + const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8( + one_byte_bytemask)); // collect the MSB from previous vector and put + // them into uint16_t mas - // UTF-32 validation - currentmax = _mm256_max_epu32(in, currentmax); - currentmax = _mm256_max_epu32(nextin, currentmax); + // no bits set above 11th bit + const __m128i one_or_two_bytes_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = + static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); - // UTF-8 validation - // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h - simd::simd8x64 in8(in, nextin); - check.check_next_input(in8); + if (one_or_two_bytes_bitmask == 0xffff) { + // case: all code units either produce 1 or 2 UTF-8 bytes (at least one + // produces 2 bytes) + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m128i v_1f00 = + _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000 + const __m128i v_003f = + _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111 - buf += 64; - } + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = _mm_and_si128(t0, v_1f00); // potential first utf8 byte + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = + _mm_and_si128(in_16, v_003f); // potential second utf8 byte + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = + _mm_or_si128(t1, t2); // first and second potential utf8 byte together + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = _mm_or_si128( + t3, + v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit - // Check which encodings are possible + // 2. merge ASCII and 2-byte codewords + const __m128i utf8_unpacked = + _mm_blendv_epi8(t4, in_16, one_byte_bytemask); - if (is_utf8) { - if (static_cast(buf - start) != len) { - uint8_t block[64]{}; - std::memset(block, 0x20, 64); - std::memcpy(block, buf, len - (buf - start)); - simd::simd8x64 in(block); - check.check_next_input(in); - } - if (!check.errors()) { - out |= simdutf::encoding_type::UTF8; - } - } + // 3. prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - + // MSB, a - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = + static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = + static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); - if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { - out |= simdutf::encoding_type::UTF16_LE; - } + // 5. 
store bytes + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - if (is_utf32 && (len % 4 == 0)) { - const __m256i standardmax = _mm256_set1_epi32(0x10ffff); - __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); - if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { - out |= simdutf::encoding_type::UTF32_LE; - } + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; } - return out; -} -/* end file src/haswell/avx2_detect_encodings.cpp */ - -/* begin file src/haswell/avx2_validate_utf16.cpp */ -/* - In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. + // Check for overflow in packing - In a vectorized algorithm we want to examine the most significant - nibble in order to select a fast path. If none of highest nibbles - are 0xD (13), than we are sure that UTF-16 chunk in a vector - register is valid. + const __m128i saturation_bytemask = _mm_cmpeq_epi32( + _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = + static_cast(_mm_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = + _mm_or_si128(forbidden_bytemask, + _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800)); - Let us analyze what we need to check if the nibble is 0xD. The - value of the preceding nibble determines what we have: + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - 0xd000 .. 0xd7ff - a valid word - 0xd800 .. 0xdbff - low surrogate - 0xdc00 .. 0xdfff - high surrogate + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes - Other constraints we have to consider: - - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) - - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) - - there must not be sole low surrogate nor high surrogate + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - We're going to build three bitmasks based on the 3rd nibble: - - V = valid word, - - L = low surrogate (0xd800 .. 0xdbff) - - H = high surrogate (0xdc00 .. 0xdfff) + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - 0 1 2 3 4 5 6 7 <--- word index - [ V | L | H | L | H | V | V | L ] - 1 0 0 0 0 1 1 0 - V = valid masks - 0 1 0 1 0 0 0 1 - L = low surrogate - 0 0 1 0 1 0 0 0 - H high surrogate + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); - 1 0 0 0 0 1 1 0 V = valid masks - 0 1 0 1 0 0 0 0 a = L & (H >> 1) - 0 0 1 0 1 0 0 0 b = a << 1 - 1 1 1 1 1 1 1 0 c = V | a | b - ^ - the last bit can be zero, we just consume 7 code units - and recheck this word in the next iteration -*/ + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m128i s0 = _mm_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); +#undef simdutf_vec -/* Returns: - - pointer to the last unprocessed character (a scalar fallback should check the rest); - - nullptr if an error was detected. -*/ -template -const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) { - const char16_t* end = input + size; + // 4. expand code units 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = + (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); + if (mask == 0) { + // We only have three-byte code units. Use fast path. + const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, + 15, 13, -1, -1, -1, -1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); - while (input + simd16::ELEMENTS * 2 < end) { - // 0. Load data: since the validation takes into account only higher - // byte of each word, we compress the two vectors into one which - // consists only the higher bytes. 
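The V/L/H mask argument laid out in the removed comment above can be replayed in scalar code on a single 8-unit block; a standalone sketch using the document's naming (L = 0xd800..0xdbff, H = 0xdc00..0xdfff), not simdutf API:

#include <cstdint>

// Returns 8 for a fully valid block, 7 if only the last unit must be
// re-examined (it may start a pair), 0 on error -- mirroring the
// c == 0xffffffff / c == 0x7fffffff cases of the 32-unit vector code.
static int validate8(const uint16_t w[8]) {
  uint8_t S = 0, H = 0;
  for (int i = 0; i < 8; i++) {
    if ((w[i] & 0xF800) == 0xD800) S |= uint8_t(1u << i); // any surrogate
    if ((w[i] & 0xFC00) == 0xDC00) H |= uint8_t(1u << i); // high (trailing)
  }
  const uint8_t V = uint8_t(~S);           // non-surrogate code units
  const uint8_t L = uint8_t(~H & S);       // low (leading) surrogates
  const uint8_t a = uint8_t(L & (H >> 1)); // low followed by high
  const uint8_t b = uint8_t(a << 1);       // the matching high positions
  const uint8_t c = uint8_t(V | a | b);    // combine all the masks
  if (c == 0xFF) return 8;
  if (c == 0x7F) return 7;
  return 0;
}

int main() {
  const uint16_t ok[8] = {'a', 0xD83D, 0xDE00, 'b', 'c', 'd', 'e', 'f'};
  const uint16_t bad[8] = {'a', 0xDE00, 0xD83D, 'b', 'c', 'd', 'e', 'f'};
  return (validate8(ok) == 8 && validate8(bad) == 0) ? 0 : 1;
}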
- auto in0 = simd16(input); - auto in1 = simd16(input + simd16::ELEMENTS); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); - if (big_endian) { - in0 = in0.swap_bytes(); - in1 = in1.swap_bytes(); - } + const uint8_t mask1 = static_cast(mask >> 8); - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); - const auto in = simd16::pack(t0, t1); + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; - // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). - const auto surrogates_wordmask = (in & v_f8) == v_d8; - const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); - if (surrogates_bitmask == 0x0) { - input += simd16::ELEMENTS * 2; + buf += 8; + } else { + // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD in the + // presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); } else { - // 2. We have some surrogates that have to be distinguished: - // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) - // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) - // - // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - - // V - non-surrogate code units - // V = not surrogates_wordmask - const uint32_t V = ~surrogates_bitmask; - - // H - word-mask for high surrogates: the six highest bits are 0b1101'11 - const auto vH = (in & v_fc) == v_dc; - const uint32_t H = vH.to_bitmask(); - - // L - word mask for low surrogates - // L = not H and surrogates_wordmask - const uint32_t L = ~H & surrogates_bitmask; - - const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint32_t b = a << 1; // Just mark that the opposite fact is hold, - // thanks to that we have only two masks for valid case. - const uint32_t c = V | a | b; // Combine all the masks into the final one. - - if (c == 0xffffffff) { - // The whole input register contains valid UTF-16, i.e., - // either single code units or proper surrogate pairs. - input += simd16::ELEMENTS * 2; - } else if (c == 0x7fffffff) { - // The 31 lower code units of the input register contains valid UTF-16. 
- // The 31 word may be either a low or high surrogate. It the next - // iteration we 1) check if the low surrogate is followed by a high - // one, 2) reject sole high surrogate. - input += simd16::ELEMENTS * 2 - 1; - } else { - return nullptr; - } + if (word > 0x10FFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); } + } + buf += k; } + } // while - return input; -} + // check for invalid input + const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff); + if (static_cast(_mm_movemask_epi8(_mm_cmpeq_epi32( + _mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) { + return std::make_pair(nullptr, utf8_output); + } + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf8_output); + } -template -const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) { - const char16_t* start = input; - const char16_t* end = input + size; - - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - - while (input + simd16::ELEMENTS * 2 < end) { - // 0. Load data: since the validation takes into account only higher - // byte of each word, we compress the two vectors into one which - // consists only the higher bytes. - auto in0 = simd16(input); - auto in1 = simd16(input + simd16::ELEMENTS); + return std::make_pair(buf, utf8_output); +} - if (big_endian) { - in0 = in0.swap_bytes(); - in1 = in1.swap_bytes(); - } +std::pair +sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_output) { + const char32_t *end = buf + len; + const char32_t *start = buf; - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); + const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); + const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); + const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); + const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff); - const auto in = simd16::pack(t0, t1); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 - // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). - const auto surrogates_wordmask = (in & v_f8) == v_d8; - const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); - if (surrogates_bitmask == 0x0) { - input += simd16::ELEMENTS * 2; - } else { - // 2. 
We have some surrogates that have to be distinguished: - // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) - // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) - // - // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - - // V - non-surrogate code units - // V = not surrogates_wordmask - const uint32_t V = ~surrogates_bitmask; - - // H - word-mask for high surrogates: the six highest bits are 0b1101'11 - const auto vH = (in & v_fc) == v_dc; - const uint32_t H = vH.to_bitmask(); - - // L - word mask for low surrogates - // L = not H and surrogates_wordmask - const uint32_t L = ~H & surrogates_bitmask; - - const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint32_t b = a << 1; // Just mark that the opposite fact is hold, - // thanks to that we have only two masks for valid case. - const uint32_t c = V | a | b; // Combine all the masks into the final one. - - if (c == 0xffffffff) { - // The whole input register contains valid UTF-16, i.e., - // either single code units or proper surrogate pairs. - input += simd16::ELEMENTS * 2; - } else if (c == 0x7fffffff) { - // The 31 lower code units of the input register contains valid UTF-16. - // The 31 word may be either a low or high surrogate. It the next - // iteration we 1) check if the low surrogate is followed by a high - // one, 2) reject sole high surrogate. - input += simd16::ELEMENTS * 2 - 1; - } else { - return result(error_code::SURROGATE, input - start); - } - } + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + // We load two 16 bytes registers for a total of 32 bytes or 8 characters. + __m128i in = _mm_loadu_si128((__m128i *)buf); + __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); + // Check for too large input + __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff); + if (static_cast(_mm_movemask_epi8( + _mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + utf8_output); } - return result(error_code::SUCCESS, input - start); -} -/* end file src/haswell/avx2_validate_utf16.cpp */ -/* begin file src/haswell/avx2_validate_utf32le.cpp */ -/* Returns: - - pointer to the last unprocessed character (a scalar fallback should check the rest); - - nullptr if an error was detected. 
-*/ -const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) { - const char32_t* end = input + size; - - const __m256i standardmax = _mm256_set1_epi32(0x10ffff); - const __m256i offset = _mm256_set1_epi32(0xffff2000); - const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); - __m256i currentmax = _mm256_setzero_si256(); - __m256i currentoffsetmax = _mm256_setzero_si256(); - - while (input + 8 < end) { - const __m256i in = _mm256_loadu_si256((__m256i *)input); - currentmax = _mm256_max_epu32(in,currentmax); - currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); - input += 8; - } - __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); - if(_mm256_testz_si256(is_zero, is_zero) == 0) { - return nullptr; - } + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), + _mm_and_si128(nextin, v_7fffffff)); - is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(_mm256_testz_si256(is_zero, is_zero) == 0) { - return nullptr; + // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp + + // Check for ASCII fast path + if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; } - return input; -} + // no bits set above 7th bit + const __m128i one_byte_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000); + const uint16_t one_byte_bitmask = + static_cast(_mm_movemask_epi8(one_byte_bytemask)); + // no bits set above 11th bit + const __m128i one_or_two_bytes_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = + static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + + if (one_or_two_bytes_bitmask == 0xffff) { + // case: all code units either produce 1 or 2 UTF-8 bytes (at least one + // produces 2 bytes) + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); -const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) { - const char32_t* start = input; - const char32_t* end = input + size; + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = _mm_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = _mm_and_si128(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = _mm_and_si128(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = _mm_or_si128(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = _mm_or_si128(t3, v_c080); - const __m256i standardmax = _mm256_set1_epi32(0x10ffff); - const __m256i offset = _mm256_set1_epi32(0xffff2000); - const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); - __m256i currentmax = _mm256_setzero_si256(); - __m256i currentoffsetmax = _mm256_setzero_si256(); + // 2. 
merge ASCII and 2-byte codewords + const __m128i utf8_unpacked = + _mm_blendv_epi8(t4, in_16, one_byte_bytemask); - while (input + 8 < end) { - const __m256i in = _mm256_loadu_si256((__m256i *)input); - currentmax = _mm256_max_epu32(in,currentmax); - currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); + // 3. prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - + // MSB, a - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = + static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = + static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); - __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); - if(_mm256_testz_si256(is_zero, is_zero) == 0) { - return result(error_code::TOO_LARGE, input - start); - } + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(_mm256_testz_si256(is_zero, is_zero) == 0) { - return result(error_code::SURROGATE, input - start); - } - input += 8; + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; } - return result(error_code::SUCCESS, input - start); -} -/* end file src/haswell/avx2_validate_utf32le.cpp */ + // Check for overflow in packing + const __m128i saturation_bytemask = _mm_cmpeq_epi32( + _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = + static_cast(_mm_movemask_epi8(saturation_bytemask)); -/* begin file src/haswell/avx2_convert_latin1_to_utf8.cpp */ -std::pair avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len, - char *utf8_output) { - const char *end = latin1_input + len; - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); - const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); - const size_t safety_margin = 12; + if (saturation_bitmask == 0xffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - while (latin1_input + 16 + safety_margin <= end) { - __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input); - // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes - const __m128i v_80 = _mm_set1_epi8((char)0x80); - if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!! - // 1. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, in8); - // 2. adjust pointers - latin1_input += 16; - utf8_output += 16; - continue; // we are done for this round! - } - // We proceed only with the first 16 bytes. - const __m256i in = _mm256_cvtepu8_epi16((in8)); + // Check for illegal surrogate code units + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + const __m128i forbidden_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800); + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + utf8_output); + } + + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. 
[0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - // 1. prepare 2-byte values - // input 16-bit word : [0000|0000|aabb|bbbb] x 8 - // expected output : [1100|00aa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - // t0 = [0000|00aa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in, 2); - // t1 = [0000|00aa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [1100|00aa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. - // 2. merge ASCII and 2-byte codewords + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); - // no bits set above 7th bit - const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); - const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m128i s0 = _mm_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); +#undef simdutf_vec - const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask); + // 4. expand code units 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = + (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); + if (mask == 0) { + // We only have three-byte code units. Use fast path. 
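The two bitmasks combined into `mask` above encode, per code unit, how many UTF-8 bytes it needs; a scalar restatement (illustrative helper, not the library's API):

#include <cstdint>

// one_byte_bitmask marks code units below 0x80 (one UTF-8 byte) and
// one_or_two_bytes_bitmask marks those below 0x800 (at most two bytes);
// a unit outside both ranges needs three bytes, so mask == 0 means the
// whole register takes the three-byte fast path.
static int utf8_len(uint16_t w) {
  if ((w & 0xFF80) == 0) return 1;
  if ((w & 0xF800) == 0) return 2;
  return 3;
}

int main() {
  return (utf8_len(0x0041) == 1 && // 'A'
          utf8_len(0x00E9) == 2 && // U+00E9, e with acute accent
          utf8_len(0x20AC) == 3)   // U+20AC, euro sign
             ? 0 : 1;
}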
+ const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, + 15, 13, -1, -1, -1, -1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t *row_2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)] - [0]; + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); - const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + const uint8_t mask1 = static_cast(mask >> 8); - const __m256i utf8_packed = _mm256_shuffle_epi8( - utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); - // 5. store bytes - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_extractf128_si256(utf8_packed, 1)); - utf8_output += row_2[0]; + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); - // 6. adjust pointers - latin1_input += 16; - continue; + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + buf += 8; + } else { + // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD in the + // presence of surrogate pairs may require non-trivial tables. 
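The four-byte branch of the scalar fallback that follows applies the usual UTF-8 bit layout; one concrete run of the same expressions, as a standalone check:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t word = 0x1F600; // above 0xFFFF: needs four UTF-8 bytes
  const unsigned char b0 = (unsigned char)((word >> 18) | 0b11110000);
  const unsigned char b1 = (unsigned char)(((word >> 12) & 0b111111) | 0b10000000);
  const unsigned char b2 = (unsigned char)(((word >> 6) & 0b111111) | 0b10000000);
  const unsigned char b3 = (unsigned char)((word & 0b111111) | 0b10000000);
  // U+1F600 encodes as F0 9F 98 80.
  assert(b0 == 0xF0 && b1 == 0x9F && b2 == 0x98 && b3 == 0x80);
}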
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } } // while - return std::make_pair(latin1_input, utf8_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); } -/* end file src/haswell/avx2_convert_latin1_to_utf8.cpp */ -/* begin file src/haswell/avx2_convert_latin1_to_utf16.cpp */ -template -std::pair avx2_convert_latin1_to_utf16(const char* latin1_input, size_t len, char16_t* utf16_output) { - size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 32 +/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 - size_t i = 0; - for (; i < rounded_len; i += 16) { - // Load 16 bytes from the address (input + i) into a xmm register - __m128i xmm0 = _mm_loadu_si128(reinterpret_cast(latin1_input + i)); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */ +struct expansion_result_t { + size_t u16count; + __m128i compressed; +}; - // Zero extend each byte in xmm0 to word and put it in another xmm register - __m128i xmm1 = _mm_cvtepu8_epi16(xmm0); +// Function sse_expand_surrogate takes four **valid** UTF-32 characters +// having at least one code-point producing a surrogate pair. 
+template +expansion_result_t sse_expand_surrogate(const __m128i x) { + using vector_u32 = simd32; + using vector_u8 = simd8; - // Shift xmm0 to the right by 8 bytes - xmm0 = _mm_srli_si128(xmm0, 8); + const auto in = vector_u32(x); - // Zero extend each byte in the shifted xmm0 to word in xmm0 - xmm0 = _mm_cvtepu8_epi16(xmm0); + const auto non_surrogate_mask = (in & uint32_t(0xffff0000)) == uint32_t(0); + const auto mask = (~non_surrogate_mask.to_4bit_bitmask()) & 0xf; - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - xmm0 = _mm_shuffle_epi8(xmm0, swap); - xmm1 = _mm_shuffle_epi8(xmm1, swap); - } + const auto t0 = in - uint32_t(0x00010000); + const auto hi = t0.shr<10>() & uint32_t(0x000003ff); + const auto lo = t0.shl<16>() & uint32_t(0x03ff0000); + const auto surrogates = (lo | hi) | uint32_t(0xdc00d800); - // Store the contents of xmm1 into the address pointed by (output + i) - _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + i), xmm1); + const auto merged = as_vector_u8(select(non_surrogate_mask, in, surrogates)); - // Store the contents of xmm0 into the address pointed by (output + i + 8) - _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + i + 8), xmm0); - } + const auto shuffle = vector_u8::load( + (byte_order == endianness::LITTLE) + ? tables::utf32_to_utf16::pack_utf32_to_utf16le[mask] + : tables::utf32_to_utf16::pack_utf32_to_utf16be[mask]); - return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); + const size_t u16count = (4 + count_ones(mask)); + const auto compressed = shuffle.lookup_16(merged); + return {u16count, compressed}; } -/* end file src/haswell/avx2_convert_latin1_to_utf16.cpp */ -/* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */ -std::pair avx2_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) { - size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8 - for (size_t i = 0; i < rounded_len; i += 8) { - // Load 8 Latin1 characters into a 64-bit register - __m128i in = _mm_loadl_epi64((__m128i*)&buf[i]); +// Function `validate_utf32` checks 2 x 4 UTF-32 characters for their validity. 
+simdutf_really_inline bool validate_utf32(const __m128i a, const __m128i b) { + using vector_u32 = simd32; - // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using vpmovzxbd - __m256i out = _mm256_cvtepu8_epi32(in); + const auto in0 = vector_u32(a); + const auto in1 = vector_u32(b); - // Store the results back to memory - _mm256_storeu_si256((__m256i*)&utf32_output[i], out); - } + const auto standardmax = vector_u32::splat(0x10ffff); + const auto offset = vector_u32::splat(0xffff2000); + const auto standardoffsetmax = vector_u32::splat(0xfffff7ff); + + const auto too_large = max(in0, in1) > standardmax; + const auto surrogate0 = (in0 + offset) > standardoffsetmax; + const auto surrogate1 = (in1 + offset) > standardoffsetmax; - // return pointers pointing to where we left off - return std::make_pair(buf + rounded_len, utf32_output + rounded_len); + const auto combined = too_large | surrogate0 | surrogate1; + return !combined.any(); } -/* end file src/haswell/avx2_convert_latin1_to_utf32.cpp */ +template +std::pair +sse_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_output) { -/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */ -// depends on "tables/utf8_to_utf16_tables.h" + const char32_t *end = buf + len; + + const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); + __m128i forbidden_bytemask = _mm_setzero_si128(); + while (end - buf >= 16 + 8) { + const __m128i *ptr = reinterpret_cast(buf); + const __m128i in0 = _mm_loadu_si128(ptr + 0); + const __m128i in1 = _mm_loadu_si128(ptr + 1); + const __m128i in2 = _mm_loadu_si128(ptr + 2); + const __m128i in3 = _mm_loadu_si128(ptr + 3); -// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). -template -size_t convert_masked_utf8_to_utf16(const char *input, - uint64_t utf8_end_of_code_point_mask, - char16_t *&utf16_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - // - // Optimization note: our main path below is load-latency dependent. Thus it is maybe - // beneficial to have fast paths that depend on branch prediction but have less latency. - // This results in more instructions but, potentially, also higher speeds. - // - // We first try a few fast paths. - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - const __m128i in = _mm_loadu_si128((__m128i *)input); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. - __m256i ascii = _mm256_cvtepu8_epi16(in); - if (big_endian) { - const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - ascii = _mm256_shuffle_epi8(ascii, swap256); - } - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii); - utf16_output += 16; // We wrote 16 16-bit characters. - return 16; // We consumed 16 bytes. - } - if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { - // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte UTF-16 code units. 
- // There is probably a more efficient sequence, but the following might do. - const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - if (big_endian) composed = _mm_shuffle_epi8(composed, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed); - utf16_output += 8; // We wrote 16 bytes, 8 code points. - return 16; - } - if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte UTF-16 code units. - // There is probably a more efficient sequence, but the following might do. - const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - __m128i composed_repacked = _mm_packus_epi32(composed, composed); - if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); - utf16_output += 4; - return 12; - } + const __m128i combined = + _mm_or_si128(_mm_or_si128(in2, in3), _mm_or_si128(in0, in1)); + if (simdutf_likely(_mm_testz_si128(combined, v_ffff0000))) { + // No bits set above 16th, directly pack UTF-32 to UTF-16 + __m128i utf16_packed0 = _mm_packus_epi32(in0, in1); + __m128i utf16_packed1 = _mm_packus_epi32(in2, in3); - const uint8_t idx = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; - if (idx < 64) { - // SIX (6) input code-code units - // this is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six code - // code units spanning between 1 and 2 bytes each is 12 bytes. On processors - // where pdep/pext is fast, we might be able to use a small lookup table. - const __m128i sh = - _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - if (big_endian) composed = _mm_shuffle_epi8(composed, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed); - utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential overflow of 4 bytes. 
- } else if (idx < 145) { - // FOUR (4) input code-code units - const __m128i sh = - _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - __m128i composed_repacked = _mm_packus_epi32(composed, composed); - if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); - utf16_output += 4; // Here we overflow by 8 bytes. - } else if (idx < 209) { - // TWO (2) input code-code units - ////////////// - // There might be garbage inputs where a leading byte mascarades as a four-byte - // leading byte (by being followed by 3 continuation byte), but is not greater than - // 0xf0. This could trigger a buffer overflow if we only counted leading - // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation. - // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs. - // We do as at the cost of an extra mask. - ///////////// - const __m128i sh = - _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); - const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); - // correct for spurious high bit - const __m128i correct = - _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); - middlehighbyte = _mm_xor_si128(correct, middlehighbyte); - const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); - // We deliberately carry the leading four bits in highbyte if they are present, - // we remove them later when computing hightenbits. - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000)); - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); - // When we need to generate a surrogate pair (leading byte > 0xF0), then - // the corresponding 32-bit value in 'composed' will be greater than - // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the - // location of the surrogate pairs. 
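The surrogate synthesis in this path is the standard 20-bit split plus the 0xD800/0xDC00 bases used above; a standalone check on one value:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t cp = 0x1F600;                         // supplementary plane
  const uint32_t v = cp - 0x10000;                     // twenty payload bits
  const uint16_t hi = uint16_t((v >> 10) + 0xD800);    // hightenbits + base
  const uint16_t lo = uint16_t((v & 0x3FF) + 0xDC00);  // lowtenbits + base
  assert(hi == 0xD83D && lo == 0xDE00);
}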
- const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), - _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); - const __m128i composedminus = - _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); - const __m128i lowtenbits = - _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); - // Notice the 0x3ff mask: - const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff)); - const __m128i lowtenbitsadd = - _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); - const __m128i hightenbitsadd = - _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); - const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); - __m128i surrogates = - _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); - uint32_t basic_buffer[4]; - uint32_t basic_buffer_swap[4]; - if (big_endian) { - _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap)); - surrogates = _mm_shuffle_epi8(surrogates, swap); - } - _mm_storeu_si128((__m128i *)basic_buffer, composed); - uint32_t surrogate_buffer[4]; - _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); - for (size_t i = 0; i < 3; i++) { - if(basic_buffer[i] > 0x3c00000) { - utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); - utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); - utf16_output += 2; - } else { - utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]); - utf16_output++; + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm_or_si128( + forbidden_bytemask, + _mm_or_si128( + _mm_cmpeq_epi16(_mm_and_si128(utf16_packed0, v_f800), v_d800), + _mm_cmpeq_epi16(_mm_and_si128(utf16_packed1, v_f800), v_d800))); + + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed0 = _mm_shuffle_epi8(utf16_packed0, swap); + utf16_packed1 = _mm_shuffle_epi8(utf16_packed1, swap); } - } - } else { - // here we know that there is an error but we do not handle errors - } - return consumed; -} -/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */ -/* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */ -// depends on "tables/utf8_to_utf16_tables.h" + _mm_storeu_si128((__m128i *)utf16_output + 0, utf16_packed0); + _mm_storeu_si128((__m128i *)utf16_output + 1, utf16_packed1); + utf16_output += 16; + buf += 16; + } else { + if (!validate_utf32(in0, in1) || !validate_utf32(in2, in3)) { + return std::make_pair(nullptr, utf16_output); + } -// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). -size_t convert_masked_utf8_to_utf32(const char *input, - uint64_t utf8_end_of_code_point_mask, - char32_t *&utf32_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - // - // Optimization note: our main path below is load-latency dependent. Thus it is maybe - // beneficial to have fast paths that depend on branch prediction but have less latency. - // This results in more instructions but, potentially, also higher speeds. - // - // We first try a few fast paths. 
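// [editor's note -- illustrative sketch, not part of the upstream patch] One
// scalar way to build the end-of-code-point mask consumed above: bit i is set
// when input[i] is the last byte of its UTF-8 sequence, i.e. when the next
// byte is not a continuation byte (0b10xxxxxx). All-ASCII input yields a mask
// of all ones, eight two-byte sequences yield 0xaaaa, and four three-byte
// sequences yield 0x924, matching the fast-path checks in this file.
static inline uint64_t scalar_end_of_code_point_mask(const uint8_t *in,
                                                     size_t n) {
  uint64_t mask = 0;
  for (size_t i = 0; i < n && i < 64; i++) {
    bool next_is_continuation = (i + 1 < n) && ((in[i + 1] & 0xC0) == 0x80);
    if (!next_is_continuation) { mask |= (uint64_t)1 << i; }
  }
  return mask;
}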
- const __m128i in = _mm_loadu_si128((__m128i *)input); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in)); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8))); - utf32_output += 16; // We wrote 16 32-bit characters. - return 16; // We consumed 16 bytes. - } - if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { - // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte UTF-32 code units. - // There is probably a more efficient sequence, but the following might do. - const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed)); - utf32_output += 8; // We wrote 16 bytes, 8 code points. - return 16; - } - if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units. - // There is probably a more efficient sequence, but the following might do. - const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - _mm_storeu_si128((__m128i *)utf32_output, composed); - utf32_output += 4; - return 12; + const auto ret0 = sse_expand_surrogate(in0); + _mm_storeu_si128((__m128i *)utf16_output, ret0.compressed); + utf16_output += ret0.u16count; + + const auto ret1 = sse_expand_surrogate(in1); + _mm_storeu_si128((__m128i *)utf16_output, ret1.compressed); + utf16_output += ret1.u16count; + + const auto ret2 = sse_expand_surrogate(in2); + _mm_storeu_si128((__m128i *)utf16_output, ret2.compressed); + utf16_output += ret2.u16count; + + const auto ret3 = sse_expand_surrogate(in3); + _mm_storeu_si128((__m128i *)utf16_output, ret3.compressed); + utf16_output += ret3.u16count; + + buf += 16; + } } - /// We do not have a fast path available, so we fallback. - const uint8_t idx = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; - if (idx < 64) { - // SIX (6) input code-code units - // this is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six code - // code units spanning between 1 and 2 bytes each is 12 bytes. On processors - // where pdep/pext is fast, we might be able to use a small lookup table. 
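// [editor's note -- illustrative sketch, not part of the upstream patch] The
// pdep/pext alternative the comment above alludes to: on x86-64 with BMI2
// (an assumption; requires <immintrin.h> and -mbmi2), _pext_u64 can gather
// the payload bits of a byte-swapped two-byte lane in one instruction.
static inline uint16_t pext_decode_utf8_2byte(uint16_t lane) {
  // lane = 110aaaaa'10bbbbbb (leading byte in the high half);
  // the mask 0x1f3f keeps aaaaa and bbbbbb and packs them contiguously.
  return (uint16_t)_pext_u64(lane, 0x1f3f);
}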
- const __m128i sh =
- _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
- const __m128i perm = _mm_shuffle_epi8(in, sh);
- const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
- const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
- const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
- _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
- utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
- // overflow of 32 - 24 = 8 bytes.
- } else if (idx < 145) {
- // FOUR (4) input code units
- const __m128i sh =
- _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
- const __m128i perm = _mm_shuffle_epi8(in, sh);
- const __m128i ascii =
- _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
- const __m128i middlebyte =
- _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
- const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
- const __m128i highbyte =
- _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
- const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
- const __m128i composed =
- _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
- _mm_storeu_si128((__m128i *)utf32_output, composed);
- utf32_output += 4;
- } else if (idx < 209) {
- // TWO (2) input code units
- const __m128i sh =
- _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
- const __m128i perm = _mm_shuffle_epi8(in, sh);
- const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
- const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
- const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
- __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
- // correct for spurious high bit
- const __m128i correct =
- _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
- middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
- const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
- const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
- const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
- const __m128i composed =
- _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
- _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
- _mm_storeu_si128((__m128i *)utf32_output, composed);
- utf32_output += 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
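// [editor's note -- illustrative sketch, not part of the upstream patch] The
// mask-and-shift chain above (0x3f00 >> 2, 0x3f0000 >> 4, 0x07000000 >> 6)
// decodes a four-byte sequence to UTF-32. A scalar equivalent, assuming
// valid input:
static inline uint32_t scalar_decode_utf8_4byte(uint8_t b0, uint8_t b1,
                                                uint8_t b2, uint8_t b3) {
  // b0 = 11110aaa, b1/b2/b3 = 10xxxxxx continuation bytes
  return ((uint32_t)(b0 & 0x07) << 18) | ((uint32_t)(b1 & 0x3f) << 12) |
         ((uint32_t)(b2 & 0x3f) << 6) | (uint32_t)(b3 & 0x3f);
}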
- } else { - // here we know that there is an error but we do not handle errors + // check for invalid input + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf16_output); } - return consumed; + + return std::make_pair(buf, utf16_output); } -/* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */ -/* begin file src/haswell/avx2_convert_utf16_to_latin1.cpp */ template -std::pair -avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len, - char *latin1_output) { - const char16_t *end = buf + len; - while (buf + 16 <= end) { - // Load 16 UTF-16 characters into 256-bit AVX2 register - __m256i in = _mm256_loadu_si256(reinterpret_cast(buf)); +std::pair +sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; - if (!match_system(big_endian)) { - const __m256i swap = _mm256_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, - 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); - } + const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); - __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); - if (_mm256_testz_si256(in, high_byte_mask)) { - // Pack 16-bit characters into 8-bit and store in latin1_output - __m128i lo = _mm256_extractf128_si256(in, 0); - __m128i hi = _mm256_extractf128_si256(in, 1); - __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); - __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); - _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), - latin1_packed_lo); - _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), - latin1_packed_hi); - // Adjust pointers for next iteration - buf += 16; - latin1_output += 16; + while (end - buf >= 8) { + const __m128i in = _mm_loadu_si128((__m128i *)buf); + const __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); + + const __m128i combined = _mm_or_si128(in, nextin); + if (simdutf_likely(_mm_testz_si128(combined, v_ffff0000))) { + // No bits set above 16th, directly pack UTF-32 to UTF-16 + __m128i utf16_packed = _mm_packus_epi32(in, nextin); + + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + const __m128i forbidden_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800); + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + utf16_output); + } + + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + + _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; } else { - return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + size_t forward = 7; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf16_output); + } + *utf16_output++ = + big_endian + ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf16_output); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = + uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = + uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; } - } // while - return std::make_pair(buf, latin1_output); + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); } +/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -template -std::pair -avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, - char *latin1_output) { - const char16_t *start = buf; - const char16_t *end = buf + len; - while (buf + 16 <= end) { - __m256i in = _mm256_loadu_si256(reinterpret_cast(buf)); +#if SIMDUTF_FEATURE_BASE64 +/* begin file src/westmere/sse_base64.cpp */ +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ + +// --- encoding ---------------------------------------------------- +template __m128i lookup_pshufb_improved(const __m128i input) { + // credit: Wojciech Muła + // reduce 0..51 -> 0 + // 52..61 -> 1 .. 10 + // 62 -> 11 + // 63 -> 12 + __m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51)); + + // distinguish between ranges 0..25 and 26..51: + // 0 .. 25 -> remains 0 + // 26 .. 
51 -> becomes 13 + const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input); + result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13))); + + __m128i shift_LUT; + if (base64_url) { + shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); + } else { + shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); + } + + // read shift + result = _mm_shuffle_epi8(shift_LUT, result); + + return _mm_add_epi8(result, input); +} + +inline __m128i insert_line_feed16(__m128i input, size_t K) { + static const uint8_t shuffle_masks[16][16] = { + {0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80}}; + // Prepare a vector with '\n' (0x0A) + __m128i line_feed_vector = _mm_set1_epi8('\n'); + + // Load the precomputed shuffle mask for K (index K-1) + __m128i mask = _mm_loadu_si128((__m128i *)shuffle_masks[K]); + __m128i lf_pos = _mm_cmpeq_epi8(mask, _mm_set1_epi8(static_cast(0x80))); + + // Perform the shuffle to reposition the K bytes + __m128i shuffled = _mm_shuffle_epi8(input, mask); + + // Blend with line_feed_vector to insert '\n' at the appropriate positions + __m128i result = _mm_blendv_epi8(shuffled, line_feed_vector, lf_pos); + + return result; +} +template +size_t encode_base64_impl(char *dst, const char *src, size_t srclen, + base64_options options, + size_t line_length = simdutf::default_line_length) { + size_t offset = 0; + if (line_length < 4) { + line_length = 4; // We do not support line_length less than 4 + } + // credit: Wojciech Muła + // SSE (lookup: pshufb improved unrolled) + const uint8_t *input = (const uint8_t *)src; - if (!big_endian) { - const __m256i swap = _mm256_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, - 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); - } + uint8_t *out = (uint8_t *)dst; + const __m128i shuf = + _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); - __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); - if (_mm256_testz_si256(in, high_byte_mask)) { - __m128i lo = _mm256_extractf128_si256(in, 0); - __m128i hi = _mm256_extractf128_si256(in, 1); - __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); - __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); - _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), - latin1_packed_lo); - _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), - latin1_packed_hi); - buf += 16; - 
latin1_output += 16;
- } else {
- // Fallback to scalar code for handling errors
- for (int k = 0; k < 16; k++) {
- uint16_t word = !match_system(big_endian)
- ? scalar::utf16::swap_bytes(buf[k])
- : buf[k];
- if (word <= 0xff) {
- *latin1_output++ = char(word);
- } else {
- return std::make_pair(
- result{error_code::TOO_LARGE, (size_t)(buf - start + k)},
- latin1_output);
- }
- }
- buf += 16;
- }
- } // while
- return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)},
- latin1_output);
-}
-/* end file src/haswell/avx2_convert_utf16_to_latin1.cpp */
-/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
-/*
- The vectorized algorithm works on a single SSE register, i.e., it
- loads eight 16-bit code units.
+ size_t i = 0;
+ for (; i + 52 <= srclen; i += 48) {
+ __m128i in0 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
+ __m128i in1 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
+ __m128i in2 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
+ __m128i in3 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
- We consider three cases:
- 1. an input register contains no surrogates and each value
- is in range 0x0000 .. 0x07ff.
- 2. an input register contains no surrogates and values are
- in range 0x0000 .. 0xffff.
- 3. an input register contains surrogates --- i.e. codepoints
- can have 16 or 32 bits.
+ in0 = _mm_shuffle_epi8(in0, shuf);
+ in1 = _mm_shuffle_epi8(in1, shuf);
+ in2 = _mm_shuffle_epi8(in2, shuf);
+ in3 = _mm_shuffle_epi8(in3, shuf);
- Ad 1.
+ const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00));
+ const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00));
+ const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00));
+ const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00));
- When values are less than 0x0800, it means that a 16-bit code unit
- can be converted into: 1) a single UTF8 byte (when it's an ASCII
- char) or 2) two UTF8 bytes.
+ const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040));
+ const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040));
+ const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040));
+ const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040));
- For this case we do only some shuffle to obtain these 2-byte
- codes and finally compress the whole SSE register with a single
- shuffle.
+ const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0));
+ const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0));
+ const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0));
+ const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0));
- We need a 256-entry lookup table to get a compression pattern
- and the number of output bytes in the compressed vector register.
- Each entry occupies 17 bytes.
+ const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010));
+ const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010));
+ const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010));
+ const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010));
- Ad 2.
+ const __m128i input0 = _mm_or_si128(t1_0, t3_0);
+ const __m128i input1 = _mm_or_si128(t1_1, t3_1);
+ const __m128i input2 = _mm_or_si128(t1_2, t3_2);
+ const __m128i input3 = _mm_or_si128(t1_3, t3_3);
- When values fit in 16-bit code units, but are above 0x07ff, then
- a single word may produce one, two or three UTF8 bytes.
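// [editor's note -- illustrative sketch, not part of the upstream patch] The
// one/two/three-byte expansion described above, written out for a single
// UTF-16 code unit (assuming it is not part of a surrogate pair):
static inline size_t scalar_utf16_to_utf8(uint16_t w, uint8_t *out) {
  if (w < 0x80) {          // case 1: ASCII, one byte
    out[0] = (uint8_t)w;
    return 1;
  } else if (w < 0x800) {  // case 2: two bytes
    out[0] = (uint8_t)(0xC0 | (w >> 6));
    out[1] = (uint8_t)(0x80 | (w & 0x3f));
    return 2;
  } else {                 // case 3: three bytes
    out[0] = (uint8_t)(0xE0 | (w >> 12));
    out[1] = (uint8_t)(0x80 | ((w >> 6) & 0x3f));
    out[2] = (uint8_t)(0x80 | (w & 0x3f));
    return 3;
  }
}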
+ const __m128i t0 = lookup_pshufb_improved(input0); + const __m128i t1 = lookup_pshufb_improved(input1); + const __m128i t2 = lookup_pshufb_improved(input2); + const __m128i t3 = lookup_pshufb_improved(input3); + + if (use_lines) { + if (line_length >= 64) { // fast path + if (offset + 64 > line_length) { + size_t location_end = line_length - offset; + size_t to_move = 64 - location_end; + if (location_end < 16) { + // We can store or extract store. See below. + //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t0); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), + insert_line_feed16(t0, location_end)); + out[16] = static_cast(_mm_extract_epi8(t0, 15)); + out += 17; + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t0); + out += 16; + } + if (location_end >= 16 && location_end < 32) { + // We can store or extract store. See below. + //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t1); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), + insert_line_feed16(t1, location_end - 16)); + out[16] = static_cast(_mm_extract_epi8(t1, 15)); + out += 17; + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t1); + out += 16; + } + if (location_end >= 32 && location_end < 48) { + // We can store or extract store. See below. + //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t2); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), + insert_line_feed16(t2, location_end - 32)); + out[16] = static_cast(_mm_extract_epi8(t2, 15)); + out += 17; + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t2); + out += 16; + } + if (location_end >= 48) { + // We can store or extract store. See below. + //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t3); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), + insert_line_feed16(t3, location_end - 48)); + out[16] = static_cast(_mm_extract_epi8(t3, 15)); + out += 17; + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t3); + out += 16; + } + offset = to_move; + } else { - We prepare data for all these three cases in two registers. - The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. 
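// [editor's note -- illustrative sketch, not part of the upstream patch] The
// shuffle/mulhi/mullo steps of the new encoder above vectorize the classic
// base64 split of each 3-byte group into four 6-bit indices, followed by an
// alphabet lookup (which lookup_pshufb_improved performs without a 64-byte
// table). A scalar equivalent for the standard alphabet:
static inline void scalar_base64_encode_triple(const uint8_t in[3],
                                               char out[4]) {
  static const char alphabet[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  out[0] = alphabet[in[0] >> 2];
  out[1] = alphabet[((in[0] & 0x03) << 4) | (in[1] >> 4)];
  out[2] = alphabet[((in[1] & 0x0f) << 2) | (in[2] >> 6)];
  out[3] = alphabet[in[2] & 0x3f];
}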
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t0); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16), t1); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32), t2); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48), t3); + offset += 64; + out += 64; + } + } else { // slow path + // could be optimized + alignas(64) uint8_t buffer[64]; + _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), t0); + _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + 16), t1); + _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + 32), t2); + _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + 48), t3); + std::memcpy(out, buffer, 64); + size_t out_pos = 0; + size_t local_offset = offset; + for (size_t j = 0; j < 64;) { + if (local_offset == line_length) { + out[out_pos++] = '\n'; + local_offset = 0; + } + out[out_pos++] = buffer[j++]; + local_offset++; + } + offset = local_offset; + out += out_pos; + } + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t0); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16), t1); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32), t2); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48), t3); + out += 64; + } + } + for (; i + 16 <= srclen; i += 12) { - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. + __m128i in = _mm_loadu_si128(reinterpret_cast(input + i)); - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + // bytes from groups A, B and C are needed in separate 32-bit lanes + // in = [DDDD|CCCC|BBBB|AAAA] + // + // an input triplet has layout + // [????????|ccdddddd|bbbbcccc|aaaaaabb] + // byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next + // triplet + // + // shuffling changes the order of bytes: 1, 0, 2, 1 + // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] + // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^ + // processed bits + in = _mm_shuffle_epi8(in, shuf); + // unpacking - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ + // t0 = [0000cccc|cc000000|aaaaaa00|00000000] + const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00)); + // t1 = [00000000|00cccccc|00000000|00aaaaaa] + // (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned + // multiplication) + const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); + // t2 = [00000000|00dddddd|000000bb|bbbb0000] + const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0)); + // t3 = [00dddddd|00000000|00bbbbbb|00000000]( + // (d * (1 << 8), b * (1 << 4)) + const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); -/* - Returns a pair: the first unprocessed byte from buf and utf8_output - A scalar routing should carry on the conversion of the tail. 
-*/ -template -std::pair avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) { - const char16_t* end = buf + len; - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); - const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 + const __m128i indices = _mm_or_si128(t1, t3); - while (buf + 16 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - if (big_endian) { - const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); - } - // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes - const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); - if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } - // no bits set above 7th bit - const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); - const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + const __m128i T0 = lookup_pshufb_improved(indices); - // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), T0); - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); - - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes - - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; - - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); - - const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); - // 5. 
store bytes - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); - utf8_output += row_2[0]; - - // 6. adjust pointers - buf += 16; - continue; + if (use_lines) { + if (line_length >= 16) { // fast path + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), T0); + if (offset + 16 > line_length) { + size_t location_end = line_length - offset; + size_t to_move = 16 - location_end; + std::memmove(out + location_end + 1, out + location_end, to_move); + out[location_end] = '\n'; + offset = to_move; + out += 16 + 1; + } else { + offset += 16; + out += 16; + } + } else { // slow path + // could be optimized + uint8_t buffer[16]; + _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), T0); + size_t out_pos = 0; + size_t local_offset = offset; + for (size_t j = 0; j < 16;) { + if (local_offset == line_length) { + out[out_pos++] = '\n'; + local_offset = 0; + } + out[out_pos++] = buffer[j++]; + local_offset++; + } + offset = local_offset; + out += out_pos; + } + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), T0); + out += 16; } - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. - const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + } + return ((char *)out - (char *)dst) + + scalar::base64::tail_encode_base64_impl( + (char *)out, src + i, srclen - i, options, line_length, offset); +} - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask)); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x00000000) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +template +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + return encode_base64_impl(dst, src, srclen, options); +} - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes +// --- decoding ----------------------------------------------- - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. +static simdutf_really_inline void compress(__m128i data, uint16_t mask, + char *output) { + if (mask == 0) { + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data); + return; + } - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. 
+ // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + + __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2], + tables::base64::thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask + shufmask = + _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m128i pruned = _mm_shuffle_epi8(data, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = tables::base64::BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + __m128i compactmask = _mm_loadu_si128(reinterpret_cast( + tables::base64::pshufb_combine_table + pop1 * 8)); + __m128i answer = _mm_shuffle_epi8(pruned, compactmask); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); +} - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. +static simdutf_really_inline void base64_decode(char *out, __m128i str) { + // credit: aqrit - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000)); - - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); -#undef simdutf_vec + const __m128i pack_shuffle = + _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); + + const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140)); + const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000)); + const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle); + // Store the output: + // this writes 16 bytes, but we only need 12. + _mm_storeu_si128((__m128i *)out, t2); +} - // 4. 
expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); +// decode 64 bytes and output 48 bytes +static inline void base64_decode_block(char *out, const char *src) { + base64_decode(out, _mm_loadu_si128(reinterpret_cast(src))); + base64_decode(out + 12, + _mm_loadu_si128(reinterpret_cast(src + 16))); + base64_decode(out + 24, + _mm_loadu_si128(reinterpret_cast(src + 32))); + base64_decode(out + 36, + _mm_loadu_si128(reinterpret_cast(src + 48))); +} - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be useful. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. - const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); - const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); - utf8_output += 12; - buf += 16; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); +static inline void base64_decode_block_safe(char *out, const char *src) { + base64_decode(out, _mm_loadu_si128(reinterpret_cast(src))); + base64_decode(out + 12, + _mm_loadu_si128(reinterpret_cast(src + 16))); + base64_decode(out + 24, + _mm_loadu_si128(reinterpret_cast(src + 32))); + char buffer[16]; + base64_decode(buffer, + _mm_loadu_si128(reinterpret_cast(src + 48))); + std::memcpy(out + 36, buffer, 12); +} - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); +// --- decoding - base64 class -------------------------------- - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); - const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); +class block64 { + __m128i chunks[4]; +public: + // The caller of this function is responsible to ensure that there are 64 + // bytes available from reading at src. 
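// [editor's note -- illustrative sketch, not part of the upstream patch]
// What the maddubs/madd pair in base64_decode above computes for each group
// of four 6-bit values a,b,c,d (stored one per byte, a first): maddubs forms
// a*0x40 + b and c*0x40 + d; madd then forms (a*0x40 + b)*0x1000 + (c*0x40 + d),
// i.e. the packed 24-bit value:
static inline uint32_t scalar_pack_sextets(uint8_t a, uint8_t b, uint8_t c,
                                           uint8_t d) {
  return ((uint32_t)a << 18) | ((uint32_t)b << 12) | ((uint32_t)c << 6) |
         (uint32_t)d;
}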
+ simdutf_really_inline block64(const char *src) { + chunks[0] = _mm_loadu_si128(reinterpret_cast(src)); + chunks[1] = _mm_loadu_si128(reinterpret_cast(src + 16)); + chunks[2] = _mm_loadu_si128(reinterpret_cast(src + 32)); + chunks[3] = _mm_loadu_si128(reinterpret_cast(src + 48)); + } - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); - const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); +public: + // The caller of this function is responsible to ensure that there are 128 + // bytes available from reading at src. The data is read into a block64 + // structure. + simdutf_really_inline block64(const char16_t *src) { + const auto m1 = _mm_loadu_si128(reinterpret_cast(src)); + const auto m2 = _mm_loadu_si128(reinterpret_cast(src + 8)); + const auto m3 = + _mm_loadu_si128(reinterpret_cast(src + 16)); + const auto m4 = + _mm_loadu_si128(reinterpret_cast(src + 24)); + const auto m5 = + _mm_loadu_si128(reinterpret_cast(src + 32)); + const auto m6 = + _mm_loadu_si128(reinterpret_cast(src + 40)); + const auto m7 = + _mm_loadu_si128(reinterpret_cast(src + 48)); + const auto m8 = + _mm_loadu_si128(reinterpret_cast(src + 56)); + chunks[0] = _mm_packus_epi16(m1, m2); + chunks[1] = _mm_packus_epi16(m3, m4); + chunks[2] = _mm_packus_epi16(m5, m6); + chunks[3] = _mm_packus_epi16(m7, m8); + } - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word & 0xFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xF800 ) != 0xD800) { - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? 
scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf8_output); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - } - } - buf += k; +public: + simdutf_really_inline void copy_block(char *output) { + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), chunks[0]); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16), chunks[1]); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 32), chunks[2]); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 48), chunks[3]); + } + +public: + simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) { + if (is_power_of_two(mask)) { + return compress_block_single(mask, output); } - } // while - return std::make_pair(buf, utf8_output); -} + uint64_t nmask = ~mask; + compress(chunks[0], uint16_t(mask), output); + compress(chunks[1], uint16_t(mask >> 16), + output + count_ones(nmask & 0xFFFF)); + compress(chunks[2], uint16_t(mask >> 32), + output + count_ones(nmask & 0xFFFFFFFF)); + compress(chunks[3], uint16_t(mask >> 48), + output + count_ones(nmask & 0xFFFFFFFFFFFFULL)); + return count_ones(nmask); + } -/* - Returns a pair: a result struct and utf8_output. - If there is an error, the count field of the result is the position of the error. - Otherwise, it is the position of the first unprocessed byte in buf (even if finished). - A scalar routing should carry on the conversion of the tail if needed. -*/ -template -std::pair avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) { - const char16_t* start = buf; - const char16_t* end = buf + len; +private: + simdutf_really_inline size_t compress_block_single(uint64_t mask, + char *output) { + const size_t pos64 = trailing_zeroes(mask); + const int8_t pos = pos64 & 0xf; + switch (pos64 >> 4) { + case 0b00: { + const __m128i v0 = _mm_set1_epi8(char(pos - 1)); + const __m128i v1 = + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i v2 = _mm_cmpgt_epi8(v1, v0); + const __m128i sh = _mm_sub_epi8(v1, v2); + const __m128i compressed = _mm_shuffle_epi8(chunks[0], sh); + + _mm_storeu_si128((__m128i *)(output + 0 * 16), compressed); + _mm_storeu_si128((__m128i *)(output + 1 * 16 - 1), chunks[1]); + _mm_storeu_si128((__m128i *)(output + 2 * 16 - 1), chunks[2]); + _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]); + } break; + case 0b01: { + _mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]); + + const __m128i v0 = _mm_set1_epi8(char(pos - 1)); + const __m128i v1 = + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i v2 = _mm_cmpgt_epi8(v1, v0); + const __m128i sh = _mm_sub_epi8(v1, v2); + const __m128i compressed = _mm_shuffle_epi8(chunks[1], sh); + + _mm_storeu_si128((__m128i *)(output + 1 * 16), compressed); + _mm_storeu_si128((__m128i *)(output + 2 * 16 - 1), chunks[2]); + _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]); + } break; + case 0b10: { + _mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]); + _mm_storeu_si128((__m128i *)(output + 1 * 16), chunks[1]); + + const __m128i v0 = _mm_set1_epi8(char(pos - 1)); + const __m128i v1 = + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15); + const __m128i v2 = _mm_cmpgt_epi8(v1, v0); + const __m128i sh = _mm_sub_epi8(v1, v2); + const __m128i compressed = _mm_shuffle_epi8(chunks[2], sh); + + _mm_storeu_si128((__m128i *)(output + 2 * 16), compressed); + _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]); + } break; + case 0b11: { + _mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]); + _mm_storeu_si128((__m128i *)(output + 1 * 16), chunks[1]); + _mm_storeu_si128((__m128i *)(output + 2 * 16), chunks[2]); + + const __m128i v0 = _mm_set1_epi8(char(pos - 1)); + const __m128i v1 = + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i v2 = _mm_cmpgt_epi8(v1, v0); + const __m128i sh = _mm_sub_epi8(v1, v2); + const __m128i compressed = _mm_shuffle_epi8(chunks[3], sh); + + _mm_storeu_si128((__m128i *)(output + 3 * 16), compressed); + } break; + } + + return 63; + } - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); - const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 +public: + template + simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) { + uint32_t err0 = 0; + uint32_t err1 = 0; + uint32_t err2 = 0; + uint32_t err3 = 0; + uint64_t m0 = to_base64_mask( + &chunks[0], &err0); + uint64_t m1 = to_base64_mask( + &chunks[1], &err1); + uint64_t m2 = to_base64_mask( + &chunks[2], &err2); + uint64_t m3 = to_base64_mask( + &chunks[3], &err3); + if (!ignore_garbage) { + *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) | + ((uint64_t)err3 << 48); + } + return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); + } - while (buf + 16 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - if (big_endian) { - const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); - } - // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes - const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); - if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! 
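// [editor's note -- illustrative sketch, not part of the upstream patch] The
// to_base64_mask routine defined just below classifies 16 input bytes at once
// with two pshufb-based hash lookups (aqrit's scheme). The scalar baseline it
// replaces, for the standard alphabet and ignoring the whitespace handling
// that the vector code layers on top:
static inline int scalar_base64_value(uint8_t c) {
  if (c >= 'A' && c <= 'Z') return c - 'A';      // 0..25
  if (c >= 'a' && c <= 'z') return c - 'a' + 26; // 26..51
  if (c >= '0' && c <= '9') return c - '0' + 52; // 52..61
  if (c == '+') return 62;
  if (c == '/') return 63;
  return -1; // not a base64 character
}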
+private: + template + simdutf_really_inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) { + const __m128i ascii_space_tbl = + _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, + 0x0, 0xc, 0xd, 0x0, 0x0); + // credit: aqrit + __m128i delta_asso; + if (default_or_url) { + delta_asso = + _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16); + } else if (base64_url) { + delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, + 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF); + } else { + delta_asso = + _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); + } + __m128i delta_values; + if (default_or_url) { + delta_values = _mm_setr_epi8( + uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13), + uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), + uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11), + uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9)); + + } else if (base64_url) { + delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), + uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), + 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), + uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); + } else { + delta_values = + _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3), + int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)); + } + __m128i check_asso; + if (default_or_url) { + check_asso = + _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06); + } else if (base64_url) { + check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, + 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6); + } else { + check_asso = + _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); + } + __m128i check_values; + if (default_or_url) { + check_values = _mm_setr_epi8( + uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6), + uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80), + uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80)); + } else if (base64_url) { + check_values = _mm_setr_epi8(uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), + uint8_t(0xB6), uint8_t(0xA6), uint8_t(0xB5), + uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, + uint8_t(0x80), 0x0, uint8_t(0x80)); + } else { + check_values = + _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), + int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80), + int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)); } - // no bits set above 7th bit - const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); - const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + const __m128i shifted = _mm_srli_epi32(*src, 3); - // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { - - // 1. 
prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); - - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes - - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; - - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); - - const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); - // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); - utf8_output += row_2[0]; - - // 6. adjust pointers - buf += 16; - continue; + __m128i delta_hash = + _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted); + if (default_or_url) { + delta_hash = _mm_and_si128(delta_hash, _mm_set1_epi8(0xf)); } - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. - const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + const __m128i check_hash = + _mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted); - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask)); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x00000000) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + const __m128i out = + _mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src); + const __m128i chk = + _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src); + const int mask = _mm_movemask_epi8(chk); + if (!ignore_garbage && mask) { + __m128i ascii_space = + _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src); + *error = (mask ^ _mm_movemask_epi8(ascii_space)); + } + *src = out; + return (uint16_t)mask; + } - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes +public: + simdutf_really_inline void base64_decode_block(char *out) { + base64_decode(out, chunks[0]); + base64_decode(out + 12, chunks[1]); + base64_decode(out + 24, chunks[2]); + base64_decode(out + 36, chunks[3]); + } - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. +public: + simdutf_really_inline void base64_decode_block_safe(char *out) { + base64_decode(out, chunks[0]); + base64_decode(out + 12, chunks[1]); + base64_decode(out + 24, chunks[2]); + char buffer[16]; + base64_decode(buffer, chunks[3]); + std::memcpy(out + 36, buffer, 12); + } +}; +/* end file src/westmere/sse_base64.cpp */ +#endif // SIMDUTF_FEATURE_BASE64 - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. +} // unnamed namespace +} // namespace westmere +} // namespace simdutf - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace westmere { +namespace { - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000)); - - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); -#undef simdutf_vec +// Walks through a buffer in block-sized increments, loading the last part with +// spaces +template struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 + * (in which case this function fills the buffer with spaces and returns 0. In + * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder + * block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. 
+ */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); - // 4. expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be useful. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. - const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); - const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); - utf8_output += 12; - buf += 16; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text_64(const uint8_t *text) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text(const simd8x64 &in) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + if (buf[i] < ' ') { + buf[i] = '_'; + } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); - const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); +simdutf_unused static char *format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i = 0; i < 64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} +template +simdutf_really_inline +buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) + : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, + idx{0} {} - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); - const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); +template +simdutf_really_inline size_t buf_block_reader::block_index() { + return idx; +} - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word & 0xFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xF800 ) != 0xD800) { - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - } - } - buf += k; - } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; } -/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */ -/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */ -/* - The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit code units. - We consider three cases: - 1. an input register contains no surrogates and each value - is in range 0x0000 .. 0x07ff. - 2. an input register contains no surrogates and values are - in range 0x0000 .. 0xffff. - 3. an input register contains surrogates --- i.e. codepoints - can have 16 or 32 bits. +template +simdutf_really_inline const uint8_t * +buf_block_reader::full_block() const { + return &buf[idx]; +} - Ad 1. +template +simdutf_really_inline size_t +buf_block_reader::get_remainder(uint8_t *dst) const { + if (len == idx) { + return 0; + } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, + STEP_SIZE); // std::memset STEP_SIZE because it is more efficient + // to write out 8 or 16 bytes at once. 
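// [Illustrative sketch, not part of simdutf] The net effect of
// get_remainder(), in scalar form: the tail of the input is copied into a
// space-padded scratch block so the same full-block routine can consume it:
//
//   uint8_t block[64];
//   std::memset(block, 0x20, sizeof(block)); // pad with ASCII spaces
//   std::memcpy(block, buf + idx, len - idx);
//   process_block(block);                    // hypothetical consumer
//
// Padding with 0x20 is safe for validation: a space is a valid one-byte
// code point, so it cannot introduce an error, and it cannot hide one
// either, since a multibyte sequence truncated at the real end remains
// invalid (0x20 is not a continuation byte).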
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} - When values are less than 0x0800, it means that a 16-bit code unit - can be converted into: 1) single UTF8 byte (when it's an ASCII - char) or 2) two UTF8 bytes. +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} - For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single - shuffle. +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_validation { - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. +using namespace simd; - Ad 2. +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
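// [Illustrative sketch, not part of simdutf] The scalar idea behind the
// three lookup_16 tables in this function: index one 16-entry table by the
// high nibble of the previous byte, one by its low nibble, and one by the
// high nibble of the current byte, then AND the three results. An error
// bit survives the AND only when all three nibbles are consistent with
// that error. Reduced to the single OVERLONG_2 flag (a 0xC0/0xC1 lead
// followed by a continuation byte):
//
//   constexpr uint8_t kOverlong2 = 1 << 5;
//   // prev>>4 == 0xC:
//   uint8_t hi1[16] = {0,0,0,0,0,0,0,0,0,0,0,0,kOverlong2,0,0,0};
//   // prev&0xF in {0,1}:
//   uint8_t lo1[16] = {kOverlong2,kOverlong2,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
//   // cur>>4 in {8,9,10,11}:
//   uint8_t hi2[16] = {0,0,0,0,0,0,0,0,
//                      kOverlong2,kOverlong2,kOverlong2,kOverlong2,0,0,0,0};
//   uint8_t flags = hi1[prev >> 4] & lo1[prev & 0x0F] & hi2[cur >> 4];
//   // flags != 0 for (0xC0,0x80), the overlong encoding of NUL;
//   // flags == 0 for the valid pair (0xC2,0x80), i.e. U+0080.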
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - When values fit in 16-bit code units, but are above 0x07ff, then - a single word may produce one, two or three UTF8 bytes. + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - We prepare data for all these three cases in two registers. - The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = {255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0b11110000u - 1, + 0b11100000u - 1, + 0b11000000u - 1}; + const simd8 max_value( + &max_array[sizeof(max_array) - sizeof(simd8)]); + return input.gt_bits(max_value); +} + +struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + simd8 prev_incomplete; - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. 
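// [Illustrative sketch, not part of simdutf] What is_incomplete() and
// prev_incomplete encode, in scalar form: a block ends "incomplete" when a
// lead byte starts so close to the end that its continuation bytes spill
// into the next block. The thresholds mirror max_array above:
//
//   bool ends_incomplete(const uint8_t *block) { // block of 64 bytes
//     return block[61] >= 0xF0    // 4-byte lead needs 3 more bytes
//         || block[62] >= 0xE0    // 3-byte lead needs 2 more bytes
//         || block[63] >= 0xC0;   // any lead needs at least 1 more
//   }
//
// This matters because the ASCII fast path in check_next_input() skips the
// full check, so it must still fold prev_incomplete into the error
// accumulator: an all-ASCII block can never complete a pending sequence.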
+ // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64 &input) { + if (simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = + is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; + } + } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ +}; // struct utf8_checker +} // namespace utf8_validation +using utf8_validation::utf8_checker; -/* - Returns a pair: the first unprocessed byte from buf and utf32_output - A scalar routing should carry on the conversion of the tail. -*/ -template -std::pair avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) { - const char16_t* end = buf + len; - const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_validation { - while (buf + 16 <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - if (big_endian) { - const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); - } +/** + * Validates that the string is actual UTF-8. 
+ */ +template +bool generic_validate_utf8(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. - const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); +bool generic_validate_utf8(const char *input, size_t length) { + return generic_validate_utf8( + reinterpret_cast(input), length); +} - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask)); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x00000000) { - // case: we extend all sixteen 16-bit code units to sixteen 32-bit code units - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1))); - utf32_output += 16; - buf += 16; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word &0xF800 ) != 0xD800) { - // No surrogate pair - *utf32_output++ = char32_t(word); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf32_output); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - } - } - buf += k; +/** + * Validates that the string is actual UTF-8 and stops on errors. 
+ */ +template +result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input + count), length - count); + res.count += count; + return res; } - } // while - return std::make_pair(buf, utf32_output); + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } } +result generic_validate_utf8_with_errors(const char *input, size_t length) { + return generic_validate_utf8_with_errors( + reinterpret_cast(input), length); +} -/* - Returns a pair: a result struct and utf8_output. - If there is an error, the count field of the result is the position of the error. - Otherwise, it is the position of the first unprocessed byte in buf (even if finished). - A scalar routing should carry on the conversion of the tail if needed. -*/ -template -std::pair avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) { - const char16_t* start = buf; - const char16_t* end = buf + len; - const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); +} // namespace utf8_validation +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_ASCII +/* begin file src/generic/ascii_validation.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace ascii_validation { - while (buf + 16 <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - if (big_endian) { - const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); +result generic_validate_ascii_with_errors(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); } + reader.advance(); - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. 
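// [Illustrative sketch, not part of simdutf] The mask test used throughout
// these kernels: UTF-16 surrogates are exactly 0xD800..0xDFFF, whose top
// five bits are 11011, so a single AND with 0xF800 classifies a code unit:
//
//   bool is_surrogate(uint16_t w)      { return (w & 0xF800) == 0xD800; }
//   bool is_high_surrogate(uint16_t w) { return (w & 0xFC00) == 0xD800; }
//   bool is_low_surrogate(uint16_t w)  { return (w & 0xFC00) == 0xDC00; }
//
// The scalar fallbacks pair surrogates with the usual arithmetic: after
// d1 = hi - 0xD800 and d2 = lo - 0xDC00, both offsets fit in 10 bits, so
// (d1 | d2) > 0x3FF rejects any mispairing in a single branch, and the
// decoded code point is (d1 << 10) + d2 + 0x10000.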
- const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask)); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x00000000) { - // case: we extend all sixteen 16-bit code units to sixteen 32-bit code units - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1))); - utf32_output += 16; - buf += 16; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word &0xF800 ) != 0xD800) { - // No surrogate pair - *utf32_output++ = char32_t(word); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? 
scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - } - } - buf += k; +bool generic_validate_ascii(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + return false; } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + return in.is_ascii(); } -/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */ - -/* begin file src/haswell/avx2_convert_utf32_to_latin1.cpp */ -std::pair -avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len, - char *latin1_output) { - const size_t rounded_len = - len & ~0x1F; // Round down to nearest multiple of 32 - __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); - - __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); +} // namespace ascii_validation +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/ascii_validation.h */ +#endif // SIMDUTF_FEATURE_ASCII - for (size_t i = 0; i < rounded_len; i += 16) { - __m256i in1 = _mm256_loadu_si256((__m256i *)buf); - __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8)); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + // transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf16 { - __m256i check_combined = _mm256_or_si256(in1, in2); +using namespace simd; - if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { - return std::make_pair(nullptr, latin1_output); +template +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. + size_t pos = 0; + char16_t *start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the + // mask far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow + // path. Anything that is not a continuation mask is a 'leading byte', + // that is, the start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* + // of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. 
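// [Illustrative sketch, not part of simdutf] How the bitmask above falls
// out, in scalar form. Continuation bytes are 0b10xxxxxx, i.e. values
// 0x80..0xBF, which as signed bytes are -128..-65 -- hence the lt(-65 + 1):
//
//   uint64_t continuation = 0;
//   for (size_t i = 0; i < 64; i++) {
//     if (int8_t(p[i]) < -64) continuation |= uint64_t(1) << i;
//   }
//   uint64_t leading = ~continuation;        // ASCII and lead bytes
//   uint64_t end_of_code_point = leading >> 1;
//
// Shifting the leading mask right by one turns "bit set where a code point
// starts" into "bit set at the last byte of the previous code point",
// which is the shape convert_masked_utf8_to_utf16 consumes.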
+ size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16( + input + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. } - - //Turn UTF32 bytes into latin 1 bytes - __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask); - __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask); - - //move Latin1 bytes to their correct spot - __m256i idx1 = _mm256_set_epi32(-1, -1,-1,-1,-1,-1,4,0); - __m256i idx2 = _mm256_set_epi32(-1, -1,-1,-1,4,0,-1,-1); - __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1); - __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2); - - __m256i result = _mm256_or_si256(reshuffled1, reshuffled2); - _mm_storeu_si128((__m128i *)latin1_output, - _mm256_castsi256_si128(result)); - - latin1_output += 16; - buf += 16; } - - return std::make_pair(buf, latin1_output); + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); + return utf16_output - start; } -std::pair -avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, - char *latin1_output) { - const size_t rounded_len = - len & ~0x1F; // Round down to nearest multiple of 32 - - __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); - __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); - - const char32_t *start = buf; - - for (size_t i = 0; i < rounded_len; i += 16) { - __m256i in1 = _mm256_loadu_si256((__m256i *)buf); - __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8)); - - __m256i check_combined = _mm256_or_si256(in1, in2); - - if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { - // Fallback to scalar code for handling errors - for (int k = 0; k < 8; k++) { - char32_t codepoint = buf[k]; - if (codepoint <= 0xFF) { - *latin1_output++ = static_cast(codepoint); - } else { - return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), - latin1_output); - } - } - buf += 8; - } else { - __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask); - __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask); - __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0); - __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1); - __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1); - __m256i 
reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2); +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - __m256i result = _mm256_or_si256(reshuffled1, reshuffled2); - _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result)); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - latin1_output += 16; - buf += 16; + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + template + simdutf_really_inline size_t convert(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
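// [Illustrative worked example, editorially hedged] With size = 200 and
// the last eight bytes ASCII, the margin loop above exits with margin =
// 192 (eight non-continuation bytes seen, since any int8_t value > -65 is
// ASCII or a lead byte), so safety_margin = 200 - 192 + 1 = 9. The SIMD
// loop below then admits a 64-byte block only while pos + 64 + 9 <= 200;
// everything from the last admitted block onward is finished by the
// scalar converter, which bounds how far a masked store can reach even on
// adversarial input.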
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
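// (Concretely: max_starting_point = pos + 52, so at most 12 of the 64
// bytes loaded for this block are reloaded by the next iteration, i.e. at
// least 52/64 = 81.25% of bytes are consumed exactly once -- the "80%"
// worst case cited above.)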
+ } } - - return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); -} -/* end file src/haswell/avx2_convert_utf32_to_latin1.cpp */ -/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */ -std::pair avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) { - const char32_t* end = buf + len; - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); - const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); - const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); - const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); - __m256i running_max = _mm256_setzero_si256(); - __m256i forbidden_bytemask = _mm256_setzero_si256(); - - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 - - while (buf + 16 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); - running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); - - // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation - __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); - in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); - - // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp) - - if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! + if (errors()) { + return 0; } - // no bits set above 7th bit - const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); - const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + if (pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); + if (howmany == 0) { + return 0; + } + utf16_output += howmany; + } + return utf16_output - start; + } - // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + template + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. 
+ size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. 
+ result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in_16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in_16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); +}; // struct utf8_checker +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* begin file src/generic/utf8/utf16_length_from_utf8_bytemask.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8 { - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes +using namespace simd; - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; +simdutf_really_inline size_t utf16_length_from_utf8_bytemask(const char *in, + size_t size) { + using vector_i8 = simd8; + using vector_u8 = simd8; + using vector_u64 = simd64; - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); + constexpr size_t N = vector_i8::SIZE; + constexpr size_t max_iterations = 255 / 2; - const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); - // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); - utf8_output += row_2[0]; + auto counters = vector_u64::zero(); + auto local = vector_u8::zero(); - // 6. 
adjust pointers - buf += 16; - continue; - } - // Must check for overflow in packing - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); - forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); + size_t iterations = 0; + size_t pos = 0; + size_t count = 0; + for (; pos + N <= size; pos += N) { + const auto input = + vector_i8::load(reinterpret_cast(in + pos)); - const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + const auto continuation = input > int8_t(-65); + const auto utf_4bytes = vector_u8(input.value) >= uint8_t(240); - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + local -= vector_u8(continuation); + local -= vector_u8(utf_4bytes); - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + iterations += 1; + if (iterations == max_iterations) { + counters += sum_8bytes(local); + local = vector_u8::zero(); + iterations = 0; + } + } - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + if (iterations > 0) { + count += local.sum_bytes(); + } - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + count += counters.sum(); - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). 
- */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000)); + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); -#undef simdutf_vec +} // namespace utf8 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8/utf16_length_from_utf8_bytemask.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 - // 4. expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf32 { - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be useful. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. 
- const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); - const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); - utf8_output += 12; - buf += 16; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); +using namespace simd; - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char32_t *utf32_output) noexcept { + size_t pos = 0; + char32_t *start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + size_t max_starting_point = (pos + 64) - 12; + while (pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32( + input + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, + utf32_output); + return utf32_output - start; +} - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); - const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf32 { +using namespace simd; +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 
<< 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); - const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; - } else { - // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // may require large, non-trivial tables? 
- size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { // 2-byte - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word & 0xFFFF0000 )==0) { // 3-byte - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { // 4-byte - if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. 
+ static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. } - buf += k; } - } // while - - // check for invalid input - const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); - if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { - return std::make_pair(nullptr, utf8_output); + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if (howmany == 0) { + return 0; + } + utf32_output += howmany; + } + return utf32_output - start; } - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); } + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. 
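// ----------------------------------------------------------------------
// [Editorial sketch; not part of the vendored simdutf sources.] The margin
// computation that follows can be read in isolation: walk back from the end
// of the buffer until eight leading bytes (bytes that are not UTF-8
// continuation bytes, i.e. not 0b10xxxxxx) have been seen. The SIMD loop
// then stops early enough that even malformed input cannot push
// convert_masked_utf8_to_utf32 past the output buffer. The helper name
// utf8_margin_of_last_leading_bytes is ours.
#include <cstddef>
#include <cstdint>
static size_t utf8_margin_of_last_leading_bytes(const char *in, size_t size) {
  size_t leading = 0;
  size_t margin = size;
  for (; margin > 0 && leading < 8; margin--) {
    // continuation bytes are 0b10xxxxxx: as int8_t they fall in [-128, -65]
    leading += (int8_t(in[margin - 1]) > -65);
  }
  // the vectorized loop then requires pos + 64 + (size - margin + 1) <= size
  return margin;
}
// ----------------------------------------------------------------------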
+ size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
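// ----------------------------------------------------------------------
// [Editorial sketch; not part of the vendored simdutf sources.] The 64-bit
// masks used above have a simple scalar reading: bit i of the leading mask
// is set when byte i is not a continuation byte; shifting that mask right by
// one sets bit i exactly when byte i is the *last* byte of its code point
// (because byte i + 1 starts a new one). convert_masked_utf8_to_utf32 uses
// this mask to consume 6 to 12 bytes per step.
#include <cstddef>
#include <cstdint>
static uint64_t utf8_end_of_code_point_mask_scalar(const char *in /* 64 bytes */) {
  uint64_t leading = 0;
  for (size_t i = 0; i < 64; i++) {
    if (int8_t(in[i]) > -65) { leading |= uint64_t(1) << i; }
  }
  return leading >> 1;
}
// ----------------------------------------------------------------------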
+ } + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if (pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } + } + return result(error_code::SUCCESS, utf32_output - start); + } - return std::make_pair(buf, utf8_output); -} + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } +}; // struct utf8_checker +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +/* begin file src/generic/utf32.h */ +#include -std::pair avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) { - const char32_t* end = buf + len; - const char32_t* start = buf; +namespace simdutf { +namespace westmere { +namespace { +namespace utf32 { - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); - const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); - const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); - const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); - const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); +template T min(T a, T b) { return a <= b ? a : b; } - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { + using vector_u32 = simd32; - while (buf + 16 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); - // Check for too large input - const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); - if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { - return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); - } + const char32_t *start = input; - // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation - __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); - in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + // we add up to three ones in a single iteration (see the vectorized loop in + // section #2 below) + const size_t max_increment = 3; - // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp) + const size_t N = vector_u32::ELEMENTS; - if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! 
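// ----------------------------------------------------------------------
// [Editorial sketch; not part of the vendored simdutf sources.] The deleted
// ASCII fast path above rests on one observation: if OR-ing 16 UTF-32 code
// units leaves no bit above 0x7F, each unit's low byte *is* its UTF-8
// encoding, so the vector code can pack and store 16 bytes at once. A scalar
// rendering (the helper name is ours):
#include <cstdint>
static bool utf32_ascii_block_to_utf8(const uint32_t *in, char *out) {
  uint32_t bits = 0;
  for (int i = 0; i < 16; i++) { bits |= in[i]; }
  if ((bits & 0xFFFFFF80) != 0) { return false; } // not all ASCII: slow path
  for (int i = 0; i < 16; i++) { out[i] = char(in[i]); }
  return true;
}
// ----------------------------------------------------------------------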
- } - // no bits set above 7th bit - const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); - const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else + const auto v_ffffff80 = vector_u32::splat(0xffffff80); + const auto v_fffff800 = vector_u32::splat(0xfffff800); + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP - // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + size_t counter = 0; - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in_16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in_16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); + // 1. vectorized loop unrolled 4 times + { + // we use vector of uint32 counters, this is why this limit is used + const size_t max_iterations = + std::numeric_limits::max() / (max_increment * 4); + size_t blocks = length / (N * 4); + length -= blocks * (N * 4); + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + simd32 acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in0 = vector_u32(input + 0 * N); + const auto in1 = vector_u32(input + 1 * N); + const auto in2 = vector_u32(input + 2 * N); + const auto in3 = vector_u32(input + 3 * N); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else + acc += min(one, in0 & v_ffffff80); + acc += min(one, in1 & v_ffffff80); + acc += min(one, in2 & v_ffffff80); + acc += min(one, in3 & v_ffffff80); + + acc += min(one, in0 & v_fffff800); + acc += min(one, in1 & v_fffff800); + acc += min(one, in2 & v_fffff800); + acc += min(one, in3 & v_fffff800); + + acc += min(one, in0 & v_ffff0000); + acc += min(one, in1 & v_ffff0000); + acc += min(one, in2 & v_ffff0000); + acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += 4 * N; + } - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + counter += acc.sum(); + } + } - // 3. 
prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes + // 2. vectorized loop for tail + { + const size_t max_iterations = + std::numeric_limits::max() / max_increment; + size_t blocks = length / N; + length -= blocks * N; + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + auto acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in = vector_u32(input); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else + acc += min(one, in & v_ffffff80); + acc += min(one, in & v_fffff800); + acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; + input += N; + } - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); + counter += acc.sum(); + } + } - const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); - // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); - utf8_output += row_2[0]; + const size_t consumed = input - start; + if (consumed != 0) { + // We don't count 0th bytes in the vectorized loops above, this + // is why we need to count them in the end. + counter += consumed; + } - // 6. adjust pointers - buf += 16; - continue; - } - // Must check for overflow in packing - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + return counter + scalar::utf32::utf8_length_from_utf32(input, length); +} - // Check for illegal surrogate code units - const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); - const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output); - } +} // namespace utf32 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf32.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 - const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#if SIMDUTF_FEATURE_UTF8 +/* begin file src/generic/utf8.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8 { - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes +using namespace simd; - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. +simdutf_really_inline size_t count_code_points(const char *in, size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. +#ifdef SIMDUTF_SIMD_HAS_BYTEMASK +simdutf_really_inline size_t count_code_points_bytemask(const char *in, + size_t size) { + using vector_i8 = simd8; + using vector_u8 = simd8; + using vector_u64 = simd64; - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + constexpr size_t N = vector_i8::SIZE; + constexpr size_t max_iterations = 255 / 4; - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000)); + size_t pos = 0; + size_t count = 0; - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); -#undef simdutf_vec + auto counters = vector_u64::zero(); + auto local = vector_u8::zero(); + size_t iterations = 0; + for (; pos + 4 * N <= size; pos += 4 * N) { + const auto input0 = + simd8::load(reinterpret_cast(in + pos + 0 * N)); + const auto input1 = + simd8::load(reinterpret_cast(in + pos + 1 * N)); + const auto input2 = + simd8::load(reinterpret_cast(in + pos + 2 * N)); + const auto input3 = + simd8::load(reinterpret_cast(in + pos + 3 * N)); + const auto mask0 = input0 > int8_t(-65); + const auto mask1 = input1 > int8_t(-65); + const auto mask2 = input2 > int8_t(-65); + const auto mask3 = input3 > int8_t(-65); - // 4. expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + local -= vector_u8(mask0); + local -= vector_u8(mask1); + local -= vector_u8(mask2); + local -= vector_u8(mask3); - // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be useful. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. - const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); - const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); - utf8_output += 12; - buf += 16; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + iterations += 1; + if (iterations == max_iterations) { + counters += sum_8bytes(local); + local = vector_u8::zero(); + iterations = 0; + } + } - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + if (iterations > 0) { + count += local.sum_bytes(); + } - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); - const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); + count += counters.sum(); + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} +#endif // SIMDUTF_SIMD_HAS_BYTEMASK - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); - const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; - } else { - // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. 
- // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // may require large, non-trivial tables? - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { // 2-byte - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word & 0xFFFF0000 )==0) { // 3-byte - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { // 4-byte - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } - } - buf += k; - } - } // while +} // namespace utf8 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8.h */ +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf16 { - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + + scalar::utf16::count_code_points(in + pos, size - pos); } -/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */ -/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */ + template -std::pair avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { - const char32_t* end = buf + len; +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
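// ----------------------------------------------------------------------
// [Editorial sketch; not part of the vendored simdutf sources.] The counts
// accumulated by the vectorized loop below match this per-code-unit scalar
// rule (native endianness assumed): ASCII needs 1 UTF-8 byte, units below
// 0x800 need 2, each half of a surrogate pair contributes 2 (so a valid pair
// totals 4), and every other unit needs 3.
#include <cstddef>
#include <cstdint>
static size_t utf8_length_from_utf16_scalar(const uint16_t *in, size_t n) {
  size_t bytes = 0;
  for (size_t i = 0; i < n; i++) {
    const uint16_t w = in[i];
    if (w < 0x80) { bytes += 1; }                        // ASCII
    else if (w < 0x800) { bytes += 2; }                  // two-byte
    else if (w >= 0xD800 && w <= 0xDFFF) { bytes += 2; } // surrogate half
    else { bytes += 3; }                                 // three-byte
  }
  return bytes;
}
// ----------------------------------------------------------------------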
+ for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 - __m256i forbidden_bytemask = _mm256_setzero_si256(); + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, + size_t size) { + return count_code_points(in, size); +} - while (buf + 8 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); +simdutf_really_inline void +change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { + size_t pos = 0; - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + while (pos < size / 32 * 32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } - // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} - if (saturation_bitmask == 0xffffffff) { - const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); - forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); +} // namespace utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf16.h */ +/* begin file src/generic/utf16/utf8_length_from_utf16_bytemask.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf16 { - __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); - } - _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; - } else { - size_t forward = 7; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } - *utf16_output++ = big_endian ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; - } - } +using namespace simd; - // check for invalid input - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } +template +simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, + size_t size) { + size_t pos = 0; - return std::make_pair(buf, utf16_output); -} + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; + const auto one = vector_u16::splat(1); -template -std::pair avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { - const char32_t* start = buf; - const char32_t* end = buf + len; + auto v_count = vector_u16::zero(); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + // each char16 yields at least one byte + size_t count = size / N * N; - while (buf + 8 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); + // in a single iteration the increment is 0, 1 or 2, despite we have + // three additions + constexpr size_t max_iterations = 65535 / 2; + size_t iteration = max_iterations; - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + for (; pos < size / N * N; pos += N) { + auto input = vector_u16::load(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input = input.swap_bytes(); + } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 .. 
0xdfff - high surrogate + const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); - // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + // c0 - chars that yield 2- or 3-byte UTF-8 codes + const auto c0 = min(input & uint16_t(0xff80), one); - if (saturation_bitmask == 0xffffffff) { - const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); - const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); - } + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + const auto c1 = min(input & uint16_t(0xf800), one); - __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); - } - _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; - } else { - size_t forward = 7; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } - *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; + /* + Explanation how the counting works. + + In the case of a non-surrogate character we count: + * always 1 -- see how `count` is initialized above; + * c0 = 1 if the current char yields 2 or 3 bytes; + * c1 = 1 if the current char yields 3 bytes. + + Thus, we always have correct count for the current char: + from 1, 2 or 3 bytes. + + A trickier part is how we count surrogate pairs. Whether + we encounter a surrogate (low or high), we count it as + 3 chars and then minus 1 (`is_surrogate` is -1 or 0). + Each surrogate char yields 2. A surrogate pair, that + is a low surrogate followed by a high one, yields + the expected 4 bytes. + + It also correctly handles cases when low surrogate is + processed by the this loop, but high surrogate is counted + by the scalar procedure. The scalar procedure uses exactly + the described approach, thanks to that for valid UTF-16 + strings it always count correctly. 
+ */ + v_count += c0; + v_count += c1; + v_count += vector_u16(is_surrogate); + + iteration -= 1; + if (iteration == 0) { + count += v_count.sum(); + v_count = vector_u16::zero(); + iteration = max_iterations; } } - return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); + if (iteration > 0) { + count += v_count.sum(); + } + + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); } -/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */ -/* begin file src/haswell/avx2_convert_utf8_to_latin1.cpp */ -// depends on "tables/utf8_to_utf16_tables.h" +template +simdutf_really_inline result +utf8_length_from_utf16_with_replacement(const char16_t *in, size_t size) { + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; + if (N + 1 > size) { + return scalar::utf16::utf8_length_from_utf16_with_replacement( + in, size); + } // special case for short inputs + size_t pos = 0; + bool any_surrogates = false; -// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). -size_t convert_masked_utf8_to_latin1(const char *input, - uint64_t utf8_end_of_code_point_mask, - char *&latin1_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - // - // Optimization note: our main path below is load-latency dependent. Thus it is maybe - // beneficial to have fast paths that depend on branch prediction but have less latency. - // This results in more instructions but, potentially, also higher speeds. - // - const __m128i in = _mm_loadu_si128((__m128i *)input); - const __m128i in_second_half = _mm_loadu_si128((__m128i *)(input + 16)); + const auto one = vector_u16::splat(1); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; //we're only processing 12 bytes in case it`s not all ASCII + auto v_count = vector_u16::zero(); + auto v_mismatched_count = vector_u16::zero(); - if((input_utf8_end_of_code_point_mask & 0xffffffff) == 0xffffffff) { - // Load the next 128 bits. + size_t count = 0; + size_t mismatched_count = 0; + + // in a single iteration the increment is 0, 1 or 2, despite we have + // three additions + constexpr size_t max_iterations = 65535 / 2; + size_t iteration = max_iterations; + + if (scalar::utf16::is_low_surrogate(in[0])) { + any_surrogates = true; + mismatched_count += 1; + } + + for (; pos < (size - 1) / N * N; pos += N) { + auto input = vector_u16::load(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input = input.swap_bytes(); + } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 .. 
0xdfff - high surrogate + const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); + + // c0 - chars that yield 2- or 3-byte UTF-8 codes + const auto c0 = min(input & uint16_t(0xff80), one); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + const auto c1 = min(input & uint16_t(0xf800), one); + + v_count += c0; + v_count += c1; + v_count += vector_u16(is_surrogate); + if (is_surrogate.to_bitmask() != 0 || + scalar::utf16::is_low_surrogate(in[pos + N])) { + any_surrogates = true; + auto input_next = + vector_u16::load(reinterpret_cast(in + pos + 1)); + if simdutf_constexpr (!match_system(big_endian)) { + input_next = input_next.swap_bytes(); + } - // Combine the two 128-bit registers into a single 256-bit register. - __m256i in_combined = _mm256_set_m128i(in_second_half, in); + const auto lb_masked = input & (0xfc00); + const auto block_masked = input_next & (0xfc00); - // We process the data in chunks of 32 bytes. - _mm256_storeu_si256(reinterpret_cast<__m256i *>(latin1_output), in_combined); + const auto lb_is_high = lb_masked == (0xd800); + const auto block_is_low = block_masked == (0xdc00); - latin1_output += 32; // We wrote 32 characters. - return 32; // We consumed 32 bytes. - } + const auto illseq = min(vector_u16(lb_is_high ^ block_is_low), one); + v_mismatched_count += illseq; + } - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. - _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); - latin1_output += 16; // We wrote 16 characters. - return 16; // We consumed 16 bytes. + iteration -= 1; + if (iteration == 0) { + count += v_count.sum(); + v_count = vector_u16::zero(); + mismatched_count += v_mismatched_count.sum(); + v_mismatched_count = vector_u16::zero(); + iteration = max_iterations; + } } - /// We do not have a fast path available, so we fallback. - const uint8_t idx = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; - // this indicates an invalid input: - if(idx >= 64) { return consumed; } - // Here we should have (idx < 64), if not, there is a bug in the validation or elsewhere. - // SIX (6) input code-code units - // this is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six code - // code units spanning between 1 and 2 bytes each is 12 bytes. On processors - // where pdep/pext is fast, we might be able to use a small lookup table. - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - const __m128i latin1_packed = _mm_packus_epi16(composed,composed); - // writing 8 bytes even though we only care about the first 6 bytes. - // performance note: it would be faster to use _mm_storeu_si128, we should investigate. - _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); - latin1_output += 6; // We wrote 6 bytes. 
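// ----------------------------------------------------------------------
// [Editorial sketch; not part of the vendored simdutf sources.] The deleted
// mask-and-shift sequence above ("ascii | (highbyte >> 2)") assembles a
// Latin1 byte from a two-byte UTF-8 sequence: five payload bits come from
// the lead byte (0xC2 or 0xC3 for Latin1) and six from the continuation
// byte. A scalar equivalent (the helper name is ours):
#include <cstdint>
static unsigned char latin1_from_two_byte_utf8(uint8_t lead, uint8_t cont) {
  return (unsigned char)(((lead & 0x1F) << 6) | (cont & 0x3F));
}
// e.g. lead 0xC3, cont 0xA9 (U+00E9) yields (0x03 << 6) | 0x29 == 0xE9.
// ----------------------------------------------------------------------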
- return consumed; -} -/* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */ - -/* begin file src/haswell/avx2_base64.cpp */ -/** - * References and further reading: - * - * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the - * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. - * https://arxiv.org/abs/1910.05109 - * - * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 - * Instructions, ACM Transactions on the Web 12 (3), 2018. - * https://arxiv.org/abs/1704.00605 - * - * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. - * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, - * Request for Comments: 4648. - * - * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. - * http://www.alfredklomp.com/programming/sse-base64/. (2014). - * - * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD - * acceleration. https://github.com/aklomp/base64. (2014). - * - * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). - * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ - * - * Nick Kopp. 2013. Base64 Encoding on a GPU. - * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). - */ - -template -simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) { - // credit: Wojciech Muła - __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); - const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); - result = - _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); - __m256i shift_LUT; - if (base64_url) { - shift_LUT = _mm256_setr_epi8( - 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0, - - 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); - } else { - shift_LUT = _mm256_setr_epi8( - 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0, - 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); + if (iteration > 0) { + count += v_count.sum(); + mismatched_count += v_mismatched_count.sum(); } - result = _mm256_shuffle_epi8(shift_LUT, result); - return _mm256_add_epi8(result, input); + if (scalar::utf16::is_low_surrogate(in[pos])) { + any_surrogates = true; + if (!scalar::utf16::is_high_surrogate(in[pos - 1])) { + mismatched_count -= 1; + count += 2; + pos += 1; + } + } + count += pos; + count += mismatched_count; + if (scalar::utf16::is_high_surrogate(in[pos - 1])) { + any_surrogates = true; + if (pos == size) { + count += 2; + } else if (scalar::utf16::is_low_surrogate(in[pos])) { + pos += 1; + count += 2; + } + } + result scalar_result = + scalar::utf16::utf8_length_from_utf16_with_replacement( + in + pos, size - pos); + return {any_surrogates ? 
SURROGATE : scalar_result.error, + count + scalar_result.count}; } -template -size_t encode_base64(char *dst, const char *src, size_t srclen) { - // credit: Wojciech Muła - const uint8_t *input = (const uint8_t *)src; +} // namespace utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf16/utf8_length_from_utf16_bytemask.h */ +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/generic/validate_utf16.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf16 { +/* + UTF-16 validation + -------------------------------------------------- - uint8_t *out = (uint8_t *)dst; - const __m256i shuf = - _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, + In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. - 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); - size_t i = 0; - for (; i + 100 <= srclen; i += 96) { - const __m128i lo0 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 0)); - const __m128i hi0 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 1)); - const __m128i lo1 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 2)); - const __m128i hi1 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 3)); - const __m128i lo2 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 4)); - const __m128i hi2 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 5)); - const __m128i lo3 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 6)); - const __m128i hi3 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 7)); + In a vectorized algorithm we want to examine the most significant + nibble in order to select a fast path. If none of highest nibbles + are 0xD (13), than we are sure that UTF-16 chunk in a vector + register is valid. - __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); - __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); - __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); - __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); + Let us analyze what we need to check if the nibble is 0xD. The + value of the preceding nibble determines what we have: + + 0xd000 .. 0xd7ff - a valid word + 0xd800 .. 0xdbff - low surrogate + 0xdc00 .. 0xdfff - high surrogate + + Other constraints we have to consider: + - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) + - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) + - there must not be sole low surrogate nor high surrogate - const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); - const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); - const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); - const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); + We are going to build three bitmasks based on the 3rd nibble: + - V = valid word, + - L = low surrogate (0xd800 .. 0xdbff) + - H = high surrogate (0xdc00 .. 
0xdfff) - const __m256i t1_0 = - _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); - const __m256i t1_1 = - _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); - const __m256i t1_2 = - _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); - const __m256i t1_3 = - _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); + 0 1 2 3 4 5 6 7 <--- word index + [ V | L | H | L | H | V | V | L ] + 1 0 0 0 0 1 1 0 - V = valid masks + 0 1 0 1 0 0 0 1 - L = low surrogate + 0 0 1 0 1 0 0 0 - H high surrogate - const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); - const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); - const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); - const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); - const __m256i t3_0 = - _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); - const __m256i t3_1 = - _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); - const __m256i t3_2 = - _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); - const __m256i t3_3 = - _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); + 1 0 0 0 0 1 1 0 V = valid masks + 0 1 0 1 0 0 0 0 a = L & (H >> 1) + 0 0 1 0 1 0 0 0 b = a << 1 + 1 1 1 1 1 1 1 0 c = V | a | b + ^ + the last bit can be zero, we just consume 7 + code units and recheck this word in the next iteration +*/ +template +const result validate_utf16_with_errors(const char16_t *input, size_t size) { + if (simdutf_unlikely(size == 0)) { + return result(error_code::SUCCESS, 0); + } + + const char16_t *start = input; + const char16_t *end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::SIZE * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = + simd16(input + simd16::SIZE / sizeof(char16_t)); + + // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16 + // and yields a single vector having only higher bytes of characters. + const auto in = utf16_gather_high_bytes(in0, in1); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint16_t surrogates_bitmask = + static_cast(surrogates_wordmask.to_bitmask()); + if (surrogates_bitmask == 0x0000) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher byte) + + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint16_t V = static_cast(~surrogates_bitmask); + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint16_t H = static_cast(vH.to_bitmask()); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint16_t L = static_cast(~H & surrogates_bitmask); + + const uint16_t a = static_cast( + L & (H >> 1)); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) 
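// ----------------------------------------------------------------------
// [Editorial sketch; not part of the vendored simdutf sources.] The a/b/c
// masks computed here can be checked by hand on the eight-word example from
// the comment above (bit i = word i, least significant bit first). The real
// code uses 16-bit masks over 16 words; eight bits suffice to show the idea.
#include <cassert>
#include <cstdint>
static void validate_utf16_mask_example() {
  const uint8_t V = 0b01100001;      // valid words at indices 0, 5, 6
  const uint8_t L = 0b10001010;      // 0xD800..0xDBFF at indices 1, 3, 7
  const uint8_t H = 0b00010100;      // 0xDC00..0xDFFF at indices 2, 4
  const uint8_t a = L & (H >> 1);    // low surrogates followed by a high one
  const uint8_t b = uint8_t(a << 1); // the matching high surrogates
  const uint8_t c = uint8_t(V | a | b); // every word accounted for?
  assert(c == 0x7f); // all but the trailing low surrogate check out:
                     // consume 7 words and recheck the last one next round
}
// ----------------------------------------------------------------------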
+ const uint16_t b = static_cast<uint16_t>( + a << 1); // Just mark that the opposite fact holds, + // thanks to that we have only two masks for the valid case. + const uint16_t c = static_cast<uint16_t>( + V | a | b); // Combine all the masks into the final one. + + if (c == 0xffff) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += 16; + } else if (c == 0x7fff) { + // The 15 lower code units of the input register contain valid UTF-16. + // The 15th word may be either a low or high surrogate. In the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += 15; + } else { + return result(error_code::SURROGATE, input - start); + } + } + } - const __m256i input0 = _mm256_or_si256(t1_0, t3_0); - const __m256i input1 = _mm256_or_si256(t1_1, t3_1); - const __m256i input2 = _mm256_or_si256(t1_2, t3_2); - const __m256i input3 = _mm256_or_si256(t1_3, t3_3); + return result(error_code::SUCCESS, input - start); +} - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), - lookup_pshufb_improved<base64_url>(input0)); - out += 32; +template <endianness big_endian> +const result validate_utf16_as_ascii_with_errors(const char16_t *input, + size_t size) { + if (simdutf_unlikely(size == 0)) { + return result(error_code::SUCCESS, 0); + } + size_t pos = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32<uint16_t> input_vec( + reinterpret_cast<const uint16_t *>(input + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input_vec.swap_bytes(); + } + uint64_t matches = input_vec.lteq(uint16_t(0x7f)); + if (~matches) { + // Found a match, return the first one + int index = trailing_zeroes(~matches) / 2; + return result(error_code::TOO_LARGE, pos + index); + } + } - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), - lookup_pshufb_improved<base64_url>(input1)); - out += 32; + // Scalar tail + while (pos < size) { - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), - lookup_pshufb_improved<base64_url>(input2)); - out += 32; - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), - lookup_pshufb_improved<base64_url>(input3)); - out += 32; + char16_t v = scalar::utf16::swap_if_needed(input[pos]); + if (v > 0x7F) { + return result(error_code::TOO_LARGE, pos); + } + pos++; } - for (; i + 28 <= srclen; i += 24) { - // lo = [xxxx|DDDC|CCBB|BAAA] - // hi = [xxxx|HHHG|GGFF|FEEE] - const __m128i lo = - _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i)); - const __m128i hi = - _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3)); + return result(error_code::SUCCESS, size); +} - // bytes from groups A, B and C are needed in separate 32-bit lanes - // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] - __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); +} // namespace utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/validate_utf16.h */ +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_latin1 { +using namespace simd; - // this part is well commented in encode.sse.cpp +simdutf_really_inline simd8<uint8_t> +check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) { + // For UTF-8 to Latin 1, we can allow any ASCII character, and any + // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or + // 0b11000010 and nothing else.
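// ----------------------------------------------------------------------
// [Editorial sketch; not part of the vendored simdutf sources.] The lookup
// tables that follow enforce, in vector form, the scalar predicate stated in
// the comment above: UTF-8 that decodes to Latin1 may contain only ASCII
// bytes and two-byte sequences led by 0xC2 or 0xC3. A direct scalar check
// (the helper name is ours):
#include <cstddef>
#include <cstdint>
static bool is_valid_latin1_utf8(const uint8_t *s, size_t n) {
  for (size_t i = 0; i < n; i++) {
    if (s[i] < 0x80) { continue; }                    // ASCII
    if ((s[i] == 0xC2 || s[i] == 0xC3)                // lead 0b1100001_
        && i + 1 < n && (s[i + 1] & 0xC0) == 0x80) {  // continuation byte
      i++; // consume the continuation byte as well
      continue;
    }
    return false; // everything else maps to a FORBIDDEN table entry
  }
  return true;
}
// ----------------------------------------------------------------------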
+ // + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + FORBIDDEN, + // 1110____ ________ + FORBIDDEN, + // 1111____ ________ + FORBIDDEN); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + FORBIDDEN, + // ____0101 ________ + FORBIDDEN, + // ____011_ ________ + FORBIDDEN, FORBIDDEN, + + // ____1___ ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, + // ____1101 ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); - const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); - const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); - const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); - const __m256i indices = _mm256_or_si256(t1, t3); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), - lookup_pshufb_improved(indices)); - out += 32; - } - return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, - srclen - i, options); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); } -static inline void compress(__m128i data, uint16_t mask, char *output) { - if (mask == 0) { - _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data); - return; +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. 
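// Illustrative sketch, not part of the patch: a scalar restatement of the
// lead-byte rule that check_special_cases above encodes in its lookup
// tables. Latin-1 covers only U+0000..U+00FF, and U+0080..U+00FF encode in
// UTF-8 as exactly two bytes with a 0xC2 or 0xC3 lead, so those are the only
// legal non-ASCII lead bytes. utf8_to_latin1_scalar is a hypothetical
// helper; it returns the number of bytes written, or 0 on error.
#include <cstddef>
#include <cstdint>

static size_t utf8_to_latin1_scalar(const uint8_t *in, size_t len,
                                    uint8_t *out) {
  size_t written = 0;
  for (size_t i = 0; i < len;) {
    if (in[i] < 0x80) { // ASCII passes through unchanged
      out[written++] = in[i++];
    } else if ((in[i] == 0xC2 || in[i] == 0xC3) && i + 1 < len &&
               (in[i + 1] & 0xC0) == 0x80) {
      // 110000ab 10cdefgh -> ab'cdefgh, which always fits in one byte
      out[written++] = uint8_t((in[i] << 6) | (in[i + 1] & 0x3F));
      i += 2;
    } else {
      return 0; // valid UTF-8 outside Latin-1, or invalid UTF-8
    }
  }
  return written;
}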
+ simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); } - // this particular implementation was inspired by work done by @animetosho - // we do it in two steps, first 8 bytes and then second 8 bytes - uint8_t mask1 = uint8_t(mask); // least significant 8 bits - uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits - // next line just loads the 64-bit values thintable_epi8[mask1] and - // thintable_epi8[mask2] into a 128-bit register, using only - // two instructions on most compilers. - __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2], - tables::base64::thintable_epi8[mask1]); - // we increment by 0x08 the second half of the mask - shufmask = - _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); - // this is the version "nearly pruned" - __m128i pruned = _mm_shuffle_epi8(data, shufmask); - // we still need to put the two halves together. - // we compute the popcount of the first half: - int pop1 = tables::base64::BitsSetTable256mul2[mask1]; - // then load the corresponding mask, what it does is to write - // only the first pop1 bytes from the first 8 bytes, and then - // it fills in with the bytes from the second 8 bytes + some filling - // at the end. - __m128i compactmask = _mm_loadu_si128(reinterpret_cast( - tables::base64::pshufb_combine_table + pop1 * 8)); - __m128i answer = _mm_shuffle_epi8(pruned, compactmask); + simdutf_really_inline size_t convert(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 16; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. 
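// Illustrative sketch, not part of the patch: the safety-margin computation
// from the transcoders above, in isolation. A byte starts a code point
// (ASCII or lead byte) exactly when int8_t(b) > -65, i.e. when it is not a
// 0b10xxxxxx continuation byte. Scanning backwards until the wanted number
// of such bytes has been seen (8 or 16, matching the kernel's worst-case
// overwrite per the comments above) yields a margin past which the
// vectorized loop must not start a 64-byte block.
#include <cstddef>
#include <cstdint>

static size_t safety_margin(const char *in, size_t size, size_t wanted_leads) {
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < wanted_leads; margin--) {
    leading_byte += (int8_t(in[margin - 1]) > -65) ? 1 : 0;
  }
  return size - margin + 1; // same expression as above; the main loop then
                            // requires pos + 64 + this margin <= size
}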
+ static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if (howmany == 0) { + return 0; + } + latin1_output += howmany; + } + return latin1_output - start; + } - _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); -} + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
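// Illustrative sketch, not part of the patch: deriving the
// utf8_end_of_code_point_mask used above with scalar code. Bit i of the
// result is set when byte i is the last byte of its code point, which is
// exactly when byte i+1 starts a new one; the topmost byte is resolved in
// the next block (the driver stops 12 bytes before the block end).
#include <cstddef>
#include <cstdint>

static uint64_t end_of_code_point_mask(const uint8_t *b, size_t n /* <= 64 */) {
  uint64_t continuation = 0;
  for (size_t i = 0; i < n; i++) {
    // continuation bytes are 0b10xxxxxx, i.e. int8_t values in [-128, -65]
    if (int8_t(b[i]) < -64) {
      continuation |= (uint64_t(1) << i);
    }
  }
  const uint64_t leading = ~continuation; // ASCII and lead bytes
  return leading >> 1; // byte i ends a code point iff byte i+1 leads
}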
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. 
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + latin1_output += res.count; + } + } + return result(error_code::SUCCESS, latin1_output - start); + } -static inline void compress(__m256i data, uint32_t mask, char *output) { - if (mask == 0) { - _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data); - return; + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); } - compress(_mm256_castsi256_si128(data), uint16_t(mask), output); - compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16), - output + _mm_popcnt_u32(~mask & 0xFFFF)); -} -struct block64 { - __m256i chunks[2]; -}; +}; // struct utf8_checker +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_latin1 { +using namespace simd; -template -static inline uint32_t to_base64_mask(__m256i *src, bool *error) { - const __m256i ascii_space_tbl = - _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, - 0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0); - // credit: aqrit - __m256i delta_asso; - if (base64_url) { - delta_asso = - _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, - 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, - 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF); - } else { - delta_asso = _mm256_setr_epi8( - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); +simdutf_really_inline size_t convert_valid(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last + // 16 bytes, and if the data is valid, then it is entirely safe because 16 + // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally + // assume that you have valid UTF-8 input, so we are going to go back from the + // end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. 
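// Illustrative usage sketch, not part of the patch. validating_transcoder
// lives in an unnamed namespace, so this is written as if it sat in the same
// translation unit; real callers go through the implementation:: methods
// further down. Note that UTF-8 to Latin-1 never grows the data (each output
// byte comes from one or two input bytes), so an input-sized buffer suffices.
#include <string>
#include <vector>

static std::string utf8_bytes_to_latin1(const std::string &utf8) {
  std::vector<char> out(utf8.size());
  utf8_to_latin1::validating_transcoder transcoder;
  const size_t written =
      transcoder.convert(utf8.data(), utf8.size(), out.data());
  if (written == 0 && !utf8.empty()) {
    return std::string(); // invalid UTF-8, or not representable in Latin-1
  }
  return std::string(out.data(), written);
}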
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, + latin1_output); + latin1_output += howmany; } + return latin1_output - start; +} - __m256i delta_values; - if (base64_url) { - delta_values = _mm256_setr_epi8( - 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), - uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0), - uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), - uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), - uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); - } else { - delta_values = _mm256_setr_epi8( - int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04), - int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00), - int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), - int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), - int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), - int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), - int8_t(0xB9), int8_t(0xB9)); +} // namespace utf8_to_latin1 +} // namespace +} // namespace westmere +} // namespace simdutf + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/generic/validate_utf32.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf32 { + +simdutf_really_inline bool validate(const char32_t *input, size_t size) { + if (simdutf_unlikely(size == 0)) { + // empty input is valid UTF-32. 
protect the implementation from + // handling nullptr + return true; } - __m256i check_asso; - if (base64_url) { - check_asso = - _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3, - 0x7, 0xB, 0x6, 0xB, 0x12, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1, - 0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0x6, 0xB, 0x12); - } else { + const char32_t *end = input + size; + + using vector_u32 = simd32; + + const auto standardmax = vector_u32::splat(0x10ffff); + const auto offset = vector_u32::splat(0xffff2000); + const auto standardoffsetmax = vector_u32::splat(0xfffff7ff); + auto currentmax = vector_u32::zero(); + auto currentoffsetmax = vector_u32::zero(); + + constexpr size_t N = vector_u32::ELEMENTS; - check_asso = _mm256_setr_epi8( - 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, - 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); + while (input + N < end) { + auto in = vector_u32(input); + if simdutf_constexpr (!match_system(endianness::BIG)) { + in.swap_bytes(); + } + + currentmax = max(currentmax, in); + currentoffsetmax = max(currentoffsetmax, in + offset); + input += N; } - __m256i check_values; - if (base64_url) { - check_values = _mm256_setr_epi8( - 0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0xCF), - uint8_t(0xBF), uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5), - uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80), uint8_t(0xB0), - uint8_t(0x80), 0x0, 0x0, 0x0, uint8_t(0x80), uint8_t(0x80), - uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD3), - uint8_t(0xA6), uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD0), - uint8_t(0x80), uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0); - } else { - check_values = _mm256_setr_epi8( - int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF), - int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86), - int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91), - int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), - int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), - int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), - int8_t(0x91), int8_t(0x80)); - } - const __m256i shifted = _mm256_srli_epi32(*src, 3); - const __m256i delta_hash = - _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted); - const __m256i check_hash = - _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted); - const __m256i out = - _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src); - const __m256i chk = - _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src); - const int mask = _mm256_movemask_epi8(chk); - if (mask) { - __m256i ascii_space = - _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src); - *error |= (mask != _mm256_movemask_epi8(ascii_space)); + + const auto too_large = currentmax > standardmax; + if (too_large.any()) { + return false; } - *src = out; - return (uint32_t)mask; -} -template -static inline uint64_t to_base64_mask(block64 *b, bool *error) { - *error = 0; - uint64_t m0 = to_base64_mask(&b->chunks[0], error); - uint64_t m1 = to_base64_mask(&b->chunks[1], error); - return m0 | (m1 << 32); -} + const auto surrogate = currentoffsetmax > standardoffsetmax; + if (surrogate.any()) { + return false; + } -static inline void copy_block(block64 *b, char *output) { - _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), b->chunks[0]); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), b->chunks[1]); + return 
scalar::utf32::validate(input, end - input); } -static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { - uint64_t nmask = ~mask; - compress(b->chunks[0], uint32_t(mask), output); - compress(b->chunks[1], uint32_t(mask >> 32), - output + _mm_popcnt_u64(nmask & 0xFFFFFFFF)); - return _mm_popcnt_u64(nmask); -} +simdutf_really_inline result validate_with_errors(const char32_t *input, + size_t size) { + if (simdutf_unlikely(size == 0)) { + // empty input is valid UTF-32. protect the implementation from + // handling nullptr + return result(error_code::SUCCESS, 0); + } -// The caller of this function is responsible to ensure that there are 64 bytes available -// from reading at src. The data is read into a block64 structure. -static inline void load_block(block64 *b, const char *src) { - b->chunks[0] = _mm256_loadu_si256(reinterpret_cast(src)); - b->chunks[1] = - _mm256_loadu_si256(reinterpret_cast(src + 32)); -} + const char32_t *start = input; + const char32_t *end = input + size; -// The caller of this function is responsible to ensure that there are 128 bytes available -// from reading at src. The data is read into a block64 structure. -static inline void load_block(block64 *b, const char16_t *src) { - __m256i m1 = _mm256_loadu_si256(reinterpret_cast(src)); - __m256i m2 = _mm256_loadu_si256(reinterpret_cast(src + 16)); - __m256i m3 = _mm256_loadu_si256(reinterpret_cast(src + 32)); - __m256i m4 = _mm256_loadu_si256(reinterpret_cast(src + 48)); - __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20); - __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31); - __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20); - __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31); - b->chunks[0] = _mm256_packus_epi16(m1p, m2p); - b->chunks[1] = _mm256_packus_epi16(m3p, m4p); -} + using vector_u32 = simd32; -static inline void base64_decode(char *out, __m256i str) { - // credit: aqrit - const __m256i pack_shuffle = - _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, - 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); - const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140)); - const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000)); - const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle); + const auto standardmax = vector_u32::splat(0x10ffff + 1); + const auto surrogate_mask = vector_u32::splat(0xfffff800); + const auto surrogate_byte = vector_u32::splat(0x0000d800); - // Store the output: - _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2)); - _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1)); -} -// decode 64 bytes and output 48 bytes -static inline void base64_decode_block(char *out, const char *src) { - base64_decode(out, - _mm256_loadu_si256(reinterpret_cast(src))); - base64_decode(out + 24, _mm256_loadu_si256( - reinterpret_cast(src + 32))); -} -static inline void base64_decode_block_safe(char *out, const char *src) { - base64_decode(out, - _mm256_loadu_si256(reinterpret_cast(src))); - char buffer[32]; // We enforce safety with a buffer. - base64_decode( - buffer, _mm256_loadu_si256(reinterpret_cast(src + 32))); - std::memcpy(out + 24, buffer, 24); -} -static inline void base64_decode_block(char *out, block64 *b) { - base64_decode(out, b->chunks[0]); - base64_decode(out + 24, b->chunks[1]); -} -static inline void base64_decode_block_safe(char *out, block64 *b) { - base64_decode(out, b->chunks[0]); - char buffer[32]; // We enforce safety with a buffer. 
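// Illustrative sketch, not part of the patch: why the 0xffff2000 offset used
// by the UTF-32 validators above detects surrogates. Adding 0xffff2000
// (mod 2^32) maps U+D800..U+DFFF onto 0xfffff800..0xffffffff, while every
// other scalar value up to 0x10FFFF wraps around to a small number, so one
// unsigned maximum plus one compare per lane flags any surrogate.
#include <cassert>
#include <cstdint>

static bool surrogate_via_offset(uint32_t v) {
  return uint32_t(v + 0xffff2000u) > 0xfffff7ffu;
}

static void offset_trick_demo() {
  assert(!surrogate_via_offset(0xD7FF));   // just below the surrogate range
  assert(surrogate_via_offset(0xD800));    // first surrogate
  assert(surrogate_via_offset(0xDFFF));    // last surrogate
  assert(!surrogate_via_offset(0xE000));   // wraps past 2^32, becomes small
  assert(!surrogate_via_offset(0x10FFFF)); // largest scalar value
}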
- base64_decode(buffer, b->chunks[1]); - std::memcpy(out + 24, buffer, 24); -} + constexpr size_t N = vector_u32::ELEMENTS; -template -result compress_decode_base64(char *dst, const chartype *src, size_t srclen, - base64_options options) { - const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value - : tables::base64::to_base64_value; - // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; - } - size_t equalsigns = 0; - if (srclen > 0 && src[srclen - 1] == '=') { - srclen--; - equalsigns = 1; - // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; + while (input + N < end) { + auto in = vector_u32(input); + if simdutf_constexpr (!match_system(endianness::BIG)) { + in.swap_bytes(); } - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; - } - if (srclen > 0 && src[srclen - 1] == '=') { - srclen--; - equalsigns = 2; + + const auto too_large = in >= standardmax; + const auto surrogate = (in & surrogate_mask) == surrogate_byte; + + const auto combined = too_large | surrogate; + if (simdutf_unlikely(combined.any())) { + const size_t consumed = input - start; + auto sr = scalar::utf32::validate_with_errors(input, end - input); + sr.count += consumed; + + return sr; } + + input += N; + } + + const size_t consumed = input - start; + auto sr = scalar::utf32::validate_with_errors(input, end - input); + sr.count += consumed; + + return sr; +} + +} // namespace utf32 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/validate_utf32.h */ +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_BASE64 +/* begin file src/generic/base64.h */ +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ +namespace simdutf { +namespace westmere { +namespace { +namespace base64 { + +/* + The following template function implements API for Base64 decoding. + + An implementation is responsible for providing the `block64` type and + associated methods that perform actual conversion. Please refer + to any vectorized implementation to learn the API of these procedures. +*/ +template +full_result +compress_decode_base64(char *dst, const chartype *src, size_t srclen, + base64_options options, + last_chunk_handling_options last_chunk_options) { + const uint8_t *to_base64 = + default_or_url ? 
tables::base64::to_base64_default_or_url_value + : (base64_url ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value); + auto ri = simdutf::scalar::base64::find_end(src, srclen, options); + size_t equallocation = ri.equallocation; + size_t equalsigns = ri.equalsigns; + srclen = ri.srclen; + size_t full_input_length = ri.full_input_length; + if (srclen == 0) { + if (!ignore_garbage && equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, full_input_length, 0}; } char *end_of_safe_64byte_zone = - (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst; + dst == nullptr + ? nullptr + : ((srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 + : dst); const chartype *const srcinit = src; const char *const dstinit = dst; @@ -27509,31 +50707,27 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, if (srclen >= 64) { const chartype *const srcend64 = src + srclen - 64; while (src <= srcend64) { - block64 b; - load_block(&b, src); + block64 b(src); src += 64; - bool error = false; - uint64_t badcharmask = to_base64_mask(&b, &error); - if (error) { + uint64_t error = 0; + const uint64_t badcharmask = + b.to_base64_mask(&error); + if (!ignore_garbage && error) { src -= 64; - while (src < srcend && to_base64[uint8_t(*src)] <= 64) { - src++; - } - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; + const size_t error_offset = trailing_zeroes(error); + return {error_code::INVALID_BASE64_CHARACTER, + size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; } if (badcharmask != 0) { - // optimization opportunity: check for simple masks like those made of - // continuous 1s followed by continuous 0s. And masks containing a - // single bad character. 
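// Illustrative sketch, not part of the patch: the three-way contract of the
// to_base64 lookup tables consulted above. A byte maps to its 6-bit value
// (0..63), to 64 when it may be skipped (ASCII whitespace), or to a value
// above 64 when it is invalid; this is what makes the branch-free
// "bufferptr += (val <= 63)" in the tail loop append data characters while
// stepping over whitespace. classify_base64_byte is a hypothetical miniature
// of the standard-alphabet table (the URL variant swaps + and / for - and _).
#include <cstdint>

static uint8_t classify_base64_byte(uint8_t c) {
  if (c >= 'A' && c <= 'Z') return uint8_t(c - 'A');      // 0..25
  if (c >= 'a' && c <= 'z') return uint8_t(c - 'a' + 26); // 26..51
  if (c >= '0' && c <= '9') return uint8_t(c - '0' + 52); // 52..61
  if (c == '+') return 62;
  if (c == '/') return 63;
  if (c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r')
    return 64;  // ignorable
  return 255;   // invalid
}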
- bufferptr += compress_block(&b, badcharmask, bufferptr); + bufferptr += b.compress_block(badcharmask, bufferptr); } else if (bufferptr != buffer) { - copy_block(&b, bufferptr); + b.copy_block(bufferptr); bufferptr += 64; } else { if (dst >= end_of_safe_64byte_zone) { - base64_decode_block_safe(dst, &b); + b.base64_decode_block_safe(dst); } else { - base64_decode_block(dst, &b); + b.base64_decode_block(dst); } dst += 48; } @@ -27564,8 +50758,10 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (val > 64) { - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; + if (!ignore_garbage && + (!scalar::base64::is_eight_byte(*src) || val > 64)) { + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; } bufferptr += (val <= 63); src++; @@ -27587,8 +50783,10 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 4); +#if !SIMDUTF_IS_BIG_ENDIAN + triple = scalar::u32_swap_bytes(triple); +#endif + std::memcpy(dst, &triple, 3); dst += 3; buffer_start += 4; @@ -27599,7115 +50797,9758 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) << 8; - triple = scalar::utf32::swap_bytes(triple); +#if !SIMDUTF_IS_BIG_ENDIAN + triple = scalar::u32_swap_bytes(triple); +#endif std::memcpy(dst, &triple, 3); dst += 3; buffer_start += 4; } // we may have 1, 2 or 3 bytes left and we need to decode them so let us - // bring in src content + // backtrack int leftover = int(bufferptr - buffer_start); - if (leftover > 0) { - while (leftover < 4 && src < srcend) { - uint8_t val = to_base64[uint8_t(*src)]; - if (val > 64) { - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; + while (leftover > 0) { + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; } - buffer_start[leftover] = char(val); - leftover += (val <= 63); - src++; - } - - if (leftover == 1) { - return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)}; - } - if (leftover == 2) { - uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) + - (uint32_t(buffer_start[1]) << 2 * 6); - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 1); - dst += 1; - } else if (leftover == 3) { - uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) + - (uint32_t(buffer_start[1]) << 2 * 6) + - (uint32_t(buffer_start[2]) << 1 * 6); - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 2); - dst += 2; } else { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 3); - dst += 3; + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } } + src--; + leftover--; } } if (src < srcend + equalsigns) { - result r = - scalar::base64::base64_tail_decode(dst, src, srcend - src, options); - if (r.error == error_code::INVALID_BASE64_CHARACTER) { - r.count += size_t(src - srcinit); - return r; - } 
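// Illustrative sketch, not part of the patch: the 4-characters-to-3-bytes
// packing done above, written out for a single quad. Four 6-bit values fill
// the low 24 bits of a 32-bit word; the code above additionally shifts by 8
// and byte-swaps on little-endian targets so that a 3-byte memcpy emits the
// bytes in order, but the arithmetic is the same.
#include <cstdint>

static void decode_quad(const uint8_t v[4], uint8_t out[3]) {
  const uint32_t triple = (uint32_t(v[0]) << 18) | (uint32_t(v[1]) << 12) |
                          (uint32_t(v[2]) << 6) | uint32_t(v[3]);
  out[0] = uint8_t(triple >> 16); // bits 23..16
  out[1] = uint8_t(triple >> 8);  // bits 15..8
  out[2] = uint8_t(triple);       // bits 7..0
}
// Example: "TWFu" -> values {19, 22, 5, 46} -> bytes {0x4D, 0x61, 0x6E},
// i.e. the ASCII string "Man".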
else { - r.count += size_t(dst - dstinit); - } - if(r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - r.error = error_code::INVALID_BASE64_CHARACTER; + full_result r = scalar::base64::base64_tail_decode( + dst, src, srcend - src, equalsigns, options, last_chunk_options); + r = scalar::base64::patch_tail_result( + r, size_t(src - srcinit), size_t(dst - dstinit), equallocation, + full_input_length, last_chunk_options); + // When is_partial(last_chunk_options) is true, we must either end with + // the end of the stream (beyond whitespace) or right after a non-ignorable + // character or at the very beginning of the stream. + // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS && + r.input_count < full_input_length) { + // First check if we can extend the input to the end of the stream + while (r.input_count < full_input_length && + base64_ignorable(*(srcinit + r.input_count), options)) { + r.input_count++; + } + // If we are still not at the end of the stream, then we must backtrack + // to the last non-ignorable character. + if (r.input_count < full_input_length) { + while (r.input_count > 0 && + base64_ignorable(*(srcinit + r.input_count - 1), options)) { + r.input_count--; + } } } return r; } - if(equalsigns > 0) { - if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)}; + if (!ignore_garbage && equalsigns > 0) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; } } - return {SUCCESS, size_t(dst - dstinit)}; + return {SUCCESS, srclen, size_t(dst - dstinit)}; } -/* end file src/haswell/avx2_base64.cpp */ +} // namespace base64 } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf - -/* begin file src/generic/buf_block_reader.h */ +/* end file src/generic/base64.h */ +/* begin file src/generic/find.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { +namespace util { + +simdutf_really_inline const char *find(const char *start, const char *end, + char character) noexcept { + // Handle empty or invalid range + if (start >= end) + return end; + // Align the start pointer to 64 bytes + uintptr_t misalignment = reinterpret_cast(start) % 64; + if (misalignment != 0) { + size_t adjustment = 64 - misalignment; + if (size_t(std::distance(start, end)) < adjustment) { + adjustment = std::distance(start, end); + } + for (size_t i = 0; i < adjustment; i++) { + if (start[i] == character) { + return start + i; + } + } + start += adjustment; + } -// Walks through a buffer in block-sized increments, loading the last part with spaces -template -struct buf_block_reader { -public: - simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - simdutf_really_inline size_t block_index(); - simdutf_really_inline bool has_full_block() const; - simdutf_really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this - * function fills the buffer with spaces and returns 0. 
In particular, if len == STEP_SIZE there - * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. - */ - simdutf_really_inline size_t get_remainder(uint8_t *dst) const; - simdutf_really_inline void advance(); -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; - -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text_64(const uint8_t *text) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - for (size_t i=0; i); i++) { - buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + // Main loop for 64-byte aligned data + for (; std::distance(start, end) >= 64; start += 64) { + simd8x64 input(reinterpret_cast(start)); + uint64_t matches = input.eq(uint8_t(character)); + if (matches != 0) { + // Found a match, return the first one + int index = trailing_zeroes(matches); + return start + index; + } } - buf[sizeof(simd8x64)] = '\0'; - return buf; + return std::find(start, end, character); } -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text(const simd8x64& in) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - in.store(reinterpret_cast(buf)); - for (size_t i=0; i); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } +simdutf_really_inline const char16_t * +find(const char16_t *start, const char16_t *end, char16_t character) noexcept { + // Handle empty or invalid range + if (start >= end) + return end; + // Align the start pointer to 64 bytes if misalignment is even + uintptr_t misalignment = reinterpret_cast(start) % 64; + if (misalignment != 0 && misalignment % 2 == 0) { + size_t adjustment = (64 - misalignment) / sizeof(char16_t); + if (size_t(std::distance(start, end)) < adjustment) { + adjustment = std::distance(start, end); + } + for (size_t i = 0; i < adjustment; i++) { + if (start[i] == character) { + return start + i; + } + } + start += adjustment; } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} -simdutf_unused static char * format_mask(uint64_t mask) { - static char *buf = reinterpret_cast(malloc(64 + 1)); - for (size_t i=0; i<64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + // Main loop for 64-byte aligned data + for (; std::distance(start, end) >= 32; start += 32) { + simd16x32 input(reinterpret_cast(start)); + uint64_t matches = input.eq(uint16_t(character)); + if (matches != 0) { + // Found a match, return the first one + int index = trailing_zeroes(matches) / 2; + return start + index; + } } - buf[64] = '\0'; - return buf; + return std::find(start, end, character); } -template -simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} +} // namespace util +} // namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/find.h */ +#endif // SIMDUTF_FEATURE_BASE64 + +// +// Implementation-specific overrides +// + +namespace simdutf { +namespace westmere { -template -simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. 
+ auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } -template -simdutf_really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; -} + int out = 0; + uint32_t utf16_err = (length % 2); + uint32_t utf32_err = (length % 4); + uint32_t ends_with_high = 0; + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + const __m128i standardmax = _mm_set1_epi32(0x10ffff); + const __m128i offset = _mm_set1_epi32(0xffff2000); + const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); + __m128i currentmax = _mm_setzero_si128(); + __m128i currentoffsetmax = _mm_setzero_si128(); + + utf8_checker c{}; + buf_block_reader<64> reader(reinterpret_cast(input), length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + // utf8 checks + c.check_next_input(in); -template -simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { - return &buf[idx]; -} + // utf16le checks + auto in0 = simd16(in.chunks[0]); + auto in1 = simd16(in.chunks[1]); + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const auto packed1 = simd16::pack(t0, t1); + auto in2 = simd16(in.chunks[2]); + auto in3 = simd16(in.chunks[3]); + const auto t2 = in2.shr<8>(); + const auto t3 = in3.shr<8>(); + const auto packed2 = simd16::pack(t2, t3); + + const auto surrogates_wordmask_lo = (packed1 & v_f8) == v_d8; + const auto surrogates_wordmask_hi = (packed2 & v_f8) == v_d8; + const uint32_t surrogates_bitmask = + (surrogates_wordmask_hi.to_bitmask() << 16) | + surrogates_wordmask_lo.to_bitmask(); + const auto vL_lo = (packed1 & v_fc) == v_dc; + const auto vL_hi = (packed2 & v_fc) == v_dc; + const uint32_t L = (vL_hi.to_bitmask() << 16) | vL_lo.to_bitmask(); + const uint32_t H = L ^ surrogates_bitmask; + utf16_err |= (((H << 1) | ends_with_high) != L); + ends_with_high = (H & 0x80000000) != 0; + + // utf32le checks + currentmax = _mm_max_epu32(in.chunks[0], currentmax); + currentoffsetmax = + _mm_max_epu32(_mm_add_epi32(in.chunks[0], offset), currentoffsetmax); + currentmax = _mm_max_epu32(in.chunks[1], currentmax); + currentoffsetmax = + _mm_max_epu32(_mm_add_epi32(in.chunks[1], offset), currentoffsetmax); + currentmax = _mm_max_epu32(in.chunks[2], currentmax); + currentoffsetmax = + _mm_max_epu32(_mm_add_epi32(in.chunks[2], offset), currentoffsetmax); + currentmax = _mm_max_epu32(in.chunks[3], currentmax); + currentoffsetmax = + _mm_max_epu32(_mm_add_epi32(in.chunks[3], offset), currentoffsetmax); -template -simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { - if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers - std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. 
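// Illustrative sketch, not part of the patch: a scalar model of the
// surrogate-pairing equation used by detect_encodings above. There, L masks
// trailing surrogates (0xDC00..0xDFFF) and H leading ones (0xD800..0xDBFF);
// a 32-unit block is well paired exactly when ((H << 1) | carry) == L, with
// carry propagating a leading surrogate from the last slot of the previous
// block. utf16_pairs_ok is a hypothetical helper over native-endian units.
#include <cstddef>
#include <cstdint>

static bool utf16_pairs_ok(const uint16_t *w, size_t n) {
  uint32_t carry = 0; // "ends_with_high" of the previous block
  for (size_t i = 0; i < n; i += 32) {
    const size_t m = (n - i < 32) ? (n - i) : 32;
    uint32_t lead = 0, trail = 0;
    for (size_t j = 0; j < m; j++) {
      const uint16_t u = w[i + j];
      if ((u & 0xFC00) == 0xD800) lead |= (1u << j);
      if ((u & 0xFC00) == 0xDC00) trail |= (1u << j);
    }
    if (((lead << 1) | carry) != trail) return false;
    carry = lead >> 31;
  }
  return carry == 0; // must not end on an unpaired leading surrogate
}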
- std::memcpy(dst, buf + idx, len - idx); - return len - idx; -} + reader.advance(); + } -template -simdutf_really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; + uint8_t block[64]{}; + size_t idx = reader.block_index(); + std::memcpy(block, &input[idx], length - idx); + simd::simd8x64 in(block); + c.check_next_input(in); + + // utf16le last block check + auto in0 = simd16(in.chunks[0]); + auto in1 = simd16(in.chunks[1]); + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const auto packed1 = simd16::pack(t0, t1); + auto in2 = simd16(in.chunks[2]); + auto in3 = simd16(in.chunks[3]); + const auto t2 = in2.shr<8>(); + const auto t3 = in3.shr<8>(); + const auto packed2 = simd16::pack(t2, t3); + + const auto surrogates_wordmask_lo = (packed1 & v_f8) == v_d8; + const auto surrogates_wordmask_hi = (packed2 & v_f8) == v_d8; + const uint32_t surrogates_bitmask = + (surrogates_wordmask_hi.to_bitmask() << 16) | + surrogates_wordmask_lo.to_bitmask(); + const auto vL_lo = (packed1 & v_fc) == v_dc; + const auto vL_hi = (packed2 & v_fc) == v_dc; + const uint32_t L = (vL_hi.to_bitmask() << 16) | vL_lo.to_bitmask(); + const uint32_t H = L ^ surrogates_bitmask; + utf16_err |= (((H << 1) | ends_with_high) != L); + // this is required to check for last byte ending in high and end of input + // is reached + ends_with_high = (H & 0x80000000) != 0; + utf16_err |= ends_with_high; + + // utf32le last block check + currentmax = _mm_max_epu32(in.chunks[0], currentmax); + currentoffsetmax = + _mm_max_epu32(_mm_add_epi32(in.chunks[0], offset), currentoffsetmax); + currentmax = _mm_max_epu32(in.chunks[1], currentmax); + currentoffsetmax = + _mm_max_epu32(_mm_add_epi32(in.chunks[1], offset), currentoffsetmax); + currentmax = _mm_max_epu32(in.chunks[2], currentmax); + currentoffsetmax = + _mm_max_epu32(_mm_add_epi32(in.chunks[2], offset), currentoffsetmax); + currentmax = _mm_max_epu32(in.chunks[3], currentmax); + currentoffsetmax = + _mm_max_epu32(_mm_add_epi32(in.chunks[3], offset), currentoffsetmax); + + reader.advance(); + + c.check_eof(); + bool is_valid_utf8 = !c.errors(); + __m128i is_zero = + _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); + utf32_err |= (_mm_test_all_zeros(is_zero, is_zero) == 0); + + is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + utf32_err |= (_mm_test_all_zeros(is_zero, is_zero) == 0); + if (is_valid_utf8) { + out |= encoding_type::UTF8; + } + if (utf16_err == 0) { + out |= encoding_type::UTF16_LE; + } + if (utf32_err == 0) { + out |= encoding_type::UTF32_LE; + } + return out; } +#endif // SIMDUTF_FEATURE_DETECT_ENCODING -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/buf_block_reader.h */ -/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -namespace simdutf { -namespace haswell { -namespace { -namespace utf8_validation { - -using namespace simd; +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return westmere::utf8_validation::generic_validate_utf8(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 
2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, +#if SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return westmere::ascii_validation::generic_validate_ascii(buf, len); +} - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = 
simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return westmere::ascii_validation::generic_validate_ascii_with_errors(buf, + len); +} +#endif // SIMDUTF_FEATURE_ASCII - // - // Return nonzero if there are incomplete multibyte characters at the end of the block: - // e.g. if there is a 4-byte character, but it's 3 bytes from the end. - // - simdutf_really_inline simd8 is_incomplete(const simd8 input) { - // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): - // ... 1111____ 111_____ 11______ - static const uint8_t max_array[32] = { - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 - }; - const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); - return input.gt_bits(max_value); - } +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return westmere::utf16::validate_utf16_as_ascii_with_errors< + endianness::LITTLE>(buf, len) + .error == SUCCESS; +} - struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; - // The last input we received - simd8 prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast path) - simd8 prev_incomplete; +simdutf_warn_unused bool +implementation::validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return westmere::utf16::validate_utf16_as_ascii_with_errors( + buf, len) + .error == SUCCESS; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - // The only problem that can happen at EOF is that a multibyte character is too short - // or a byte value too large in the last bytes: check_special_cases only checks for bytes - // too large in the first of two bytes. - simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error |= this->prev_incomplete; - } +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. protect the implementation from + // handling nullptr + return true; + } + const auto res = + westmere::utf16::validate_utf16_with_errors(buf, len); + if (res.is_err()) { + return false; + } - simdutf_really_inline void check_next_input(const simd8x64& input) { - if(simdutf_likely(is_ascii(input))) { - this->error |= this->prev_incomplete; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
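// Illustrative sketch, not part of the patch: a scalar restatement of the
// is_incomplete trick in the (removed) checker above. A 64-byte block ends
// "incomplete" when its last bytes open a code point whose continuation
// bytes fall into the next block: a four-byte lead (>= 0xF0) third from the
// end, a three-byte-or-longer lead (>= 0xE0) second from the end, or any
// lead (>= 0xC0) in the final byte; comparing the tail against 0xF0-1,
// 0xE0-1 and 0xC0-1, as max_array does, detects exactly those cases.
#include <cstdint>

static bool block_ends_incomplete(const uint8_t block[64]) {
  return block[61] >= 0xF0    // needs 3 continuation bytes, only 2 remain
      || block[62] >= 0xE0    // needs 2+ continuation bytes, only 1 remains
      || block[63] >= 0xC0;   // needs 1+ continuation byte, none remain
}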
- static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); - this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + if (res.count == len) + return true; - } - } + return scalar::utf16::validate(buf + res.count, + len - res.count); +} +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING - // do not forget to call check_eof! - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. protect the implementation from + // handling nullptr + return true; + } + const auto res = + westmere::utf16::validate_utf16_with_errors(buf, len); + if (res.is_err()) { + return false; + } - }; // struct utf8_checker -} // namespace utf8_validation + if (res.count == len) + return true; -using utf8_validation::utf8_checker; + return scalar::utf16::validate(buf + res.count, + len - res.count); +} -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -/* begin file src/generic/utf8_validation/utf8_validator.h */ -namespace simdutf { -namespace haswell { -namespace { -namespace utf8_validation { +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + const result res = + westmere::utf16::validate_utf16_with_errors(buf, len); + if (res.count != len) { + const result scalar_res = + scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} -/** - * Validates that the string is actual UTF-8. 
- */ -template -bool generic_validate_utf8(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - return !c.errors(); +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + const result res = + westmere::utf16::validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } } -bool generic_validate_utf8(const char * input, size_t length) { - return generic_validate_utf8(reinterpret_cast(input),length); +void implementation::to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return utf16fix_sse(input, len, output); } -/** - * Validates that the string is actual UTF-8 and stops on errors. - */ -template -result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - if(c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input + count), length - count); - res.count += count; - return res; - } - reader.advance(); - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - if (c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input) + count, length - count); - res.count += count; - return res; - } else { - return result(error_code::SUCCESS, length); - } +void implementation::to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return utf16fix_sse(input, len, output); } +#endif // SIMDUTF_FEATURE_UTF16 -result generic_validate_utf8_with_errors(const char * input, size_t length) { - return generic_validate_utf8_with_errors(reinterpret_cast(input),length); +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + return utf32::validate(buf, len); } +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING -template -bool generic_validate_ascii(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - uint8_t blocks[64]{}; - simd::simd8x64 running_or(blocks); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - running_or |= in; - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - running_or |= in; - return running_or.is_ascii(); +#if SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + return 
utf32::validate_with_errors(buf, len); } +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + + std::pair ret = + sse_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; + + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } -bool generic_validate_ascii(const char * input, size_t length) { - return generic_validate_ascii(reinterpret_cast(input),length); + return converted_chars; } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -template -result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + sse_convert_latin1_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; } - reader.advance(); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} - count += 64; +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + sse_convert_latin1_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); - } else { - return result(error_code::SUCCESS, length); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; } + return converted_chars; } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 -result generic_validate_ascii_with_errors(const char * input, size_t length) { - return generic_validate_ascii_with_errors(reinterpret_cast(input),length); +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + sse_convert_latin1_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), 
ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; } +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 -} // namespace utf8_validation -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_validator.h */ -// transcoding from UTF-8 to UTF-16 -/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ - - -namespace simdutf { -namespace haswell { -namespace { -namespace utf8_to_utf16 { +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert(buf, len, latin1_output); +} -using namespace simd; +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert_with_errors(buf, len, latin1_output); +} -template -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char16_t* utf16_output) noexcept { - // The implementation is not specific to haswell and should be moved to the generic directory. - size_t pos = 0; - char16_t* start{utf16_output}; - const size_t safety_margin = 16; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - // this loop could be unrolled further. For example, we could process the mask - // far more than 64 bytes. - simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { - in.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // Slow path. We hope that the compiler will recognize that this is a slow path. - // Anything that is not a continuation mask is a 'leading byte', that is, the - // start of a new code point. - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - // The *start* of code points is not so useful, rather, we want the *end* of code points. - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times when using solely - // the slow/regular path, and at least four times if there are fast paths. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - // - // Thus we may allow convert_masked_utf8_to_utf16 to process - // more bytes at a time under a fast-path mode where 16 bytes - // are consumed at once (e.g., when encountering ASCII). 
- size_t consumed = convert_masked_utf8_to_utf16(input + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); - return utf16_output - start; +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return westmere::utf8_to_latin1::convert_valid(buf, len, latin1_output); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} -namespace simdutf { -namespace haswell { -namespace { -namespace utf8_to_utf16 { -using namespace simd; +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, + utf16_output); +} +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ 
________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. 
- simd8 error; +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - - template - simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { - size_t pos = 0; - char16_t* start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. 
- // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); - if(howmany == 0) { return 0; } - utf16_output += howmany; - } - return utf16_output - start; - } - - template - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { - size_t pos = 0; - char16_t* start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. 
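#include <cstddef>
#include <cstdint>
// The rewind referenced above only has to back up over continuation bytes
// so that scalar reprocessing restarts on a code-point boundary; a minimal
// sketch of that step (illustrative helper, not the scalar
// rewind_and_convert_with_errors API itself):
static size_t rewind_to_code_point_start(const uint8_t *buf, size_t pos) {
  // At most three 10xxxxxx continuation bytes can precede the boundary,
  // since the longest UTF-8 sequence is four bytes.
  while (pos > 0 && (buf[pos] & 0xC0) == 0x80) {
    pos--;
  }
  return pos;
}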
- size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - if(pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf16_output += res.count; - } - } - return result(error_code::SUCCESS, utf16_output - start); - } +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + sse_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - }; // struct utf8_checker -} // utf8_to_utf16 namespace -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -// transcoding from UTF-8 to UTF-32 -/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ - -namespace simdutf { -namespace haswell { -namespace { -namespace utf8_to_utf32 { +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + sse_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; -using namespace simd; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), 
ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + sse_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char32_t* utf32_output) noexcept { - size_t pos = 0; - char32_t* start{utf32_output}; - const size_t safety_margin = 16; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { - in.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + sse_convert_utf16_to_latin1_with_errors(buf, len, + latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; } else { - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - size_t max_starting_point = (pos + 64) - 12; - while(pos < max_starting_point) { - size_t consumed = convert_masked_utf8_to_utf32(input + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } + ret.second += scalar_res.count; } } - utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); - return utf32_output - start; + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; } +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: we could provide an optimized function. 
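#include <cstddef>
#include <utility>
// Schematic of the composition that the convert_* overloads above follow
// (names are placeholders, not simdutf API): run the SIMD stage until it
// stops, then let a scalar stage finish whatever remains. The SIMD stage
// reports how far it advanced in both buffers; a scalar return of 0
// signals invalid input, as in the functions above.
template <typename In, typename Out, typename Simd, typename Scalar>
size_t convert_with_fallback(const In *buf, size_t len, Out *out,
                             Simd simd_stage, Scalar scalar_stage) {
  std::pair<const In *, Out *> ret = simd_stage(buf, len, out);
  if (ret.first == nullptr) {
    return 0; // SIMD stage hit invalid input
  }
  size_t written = size_t(ret.second - out);
  if (ret.first != buf + len) { // SIMD stage stopped early: scalar tail
    size_t tail = scalar_stage(ret.first, len - size_t(ret.first - buf),
                               ret.second);
    if (tail == 0) {
      return 0;
    }
    written += tail;
  }
  return written;
}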
+ return convert_utf16be_to_latin1(buf, len, latin1_output); +} -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: we could provide an optimized function. + return convert_utf16le_to_latin1(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + sse_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} -namespace simdutf { -namespace haswell { -namespace { -namespace utf8_to_utf32 { -using namespace simd; +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + sse_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + westmere::sse_convert_utf16_to_utf8_with_errors( + buf, len, utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 
1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + westmere::sse_convert_utf16_to_utf8_with_errors( + buf, len, utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} - // ________ 11______ - TOO_SHORT, 
TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + sse_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; + size_t saved_bytes = ret.second - latin1_output; + // if (ret.first != buf + len) { + if (ret.first < buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; } + return saved_bytes; +} +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + westmere::sse_convert_utf32_to_latin1_with_errors(buf, len, + latin1_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: we could provide an optimized function. + return convert_utf32_to_latin1(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - - - simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { - size_t pos = 0; - char32_t* start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. 
If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); - if(howmany == 0) { return 0; } - utf32_output += howmany; - } - return utf32_output - start; - } - - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { - size_t pos = 0; - char32_t* start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. 
If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. 
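#include <cstddef>
#include <cstdint>
// The backwards scan at the top of both convert() variants above, isolated
// as a sketch: it finds how many trailing bytes to reserve for the scalar
// tail, so that the SIMD loop, which may write up to 8 units past its
// output cursor, can never overrun on malformed input.
static size_t safety_margin(const char *in, size_t size) {
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    // lead bytes and ASCII are > -65 as signed; continuations are not
    leading_byte += (int8_t(in[margin - 1]) > -65);
  }
  return size - margin + 1; // bytes excluded from the 64-byte main loop
}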
- } - } - if(errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - if(pos < size) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf32_output += res.count; - } - } - return result(error_code::SUCCESS, utf32_output - start); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + sse_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 - }; // struct utf8_checker -} // utf8_to_utf32 namespace -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -// other functions -/* begin file src/generic/utf8.h */ - -namespace simdutf { -namespace haswell { -namespace { -namespace utf8 { - -using namespace simd; +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + sse_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} -simdutf_really_inline size_t count_code_points(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.gt(-65); - count += count_ones(utf8_continuation_mask); +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + 
std::pair ret = + sse_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } - return count + scalar::utf8::count_code_points(in + pos, size - pos); + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; } -simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - // We count one word for anything that is not a continuation (so - // leading bytes). - count += 64 - count_ones(utf8_continuation_mask); - int64_t utf8_4byte = input.gteq_unsigned(240); - count += count_ones(utf8_4byte); +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + westmere::sse_convert_utf16_to_utf32_with_errors( + buf, len, utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } - return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); + } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written + return ret.first; } -} // utf8 namespace -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf8.h */ -/* begin file src/generic/utf16.h */ -namespace simdutf { -namespace haswell { -namespace { -namespace utf16 { -template -simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos < size/32*32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { input.swap_bytes(); } - uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); - count += count_ones(not_pair) / 2; +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + westmere::sse_convert_utf16_to_utf32_with_errors( + buf, len, utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } - return count + 
scalar::utf16::count_code_points(in + pos, size - pos); + } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written + return ret.first; } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -template -simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for(;pos < size/32*32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { input.swap_bytes(); } - uint64_t ascii_mask = input.lteq(0x7F); - uint64_t twobyte_mask = input.lteq(0x7FF); - uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf32_to_utf8(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 - size_t ascii_count = count_ones(ascii_mask) / 2; - size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; - size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; - size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; - count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + sse_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } - return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; } -template -simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { - return count_code_points(in, size); +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + sse_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; } -simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { - size_t pos = 0; - - while (pos < size/32*32) { - simd16x32 input(reinterpret_cast(in + pos)); - input.swap_bytes(); - input.store(reinterpret_cast(output)); - pos += 32; - output += 32; +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + westmere::sse_convert_utf32_to_utf16_with_errors( + buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = + 
scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} - scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + westmere::sse_convert_utf32_to_utf16_with_errors( + buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; } -} // utf16 -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf16.h */ +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} -// transcoding from UTF-8 to Latin 1 -/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -namespace simdutf { -namespace haswell { -namespace { -namespace utf8_to_latin1 { -using namespace simd; +#if SIMDUTF_FEATURE_UTF16 +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// For UTF-8 to Latin 1, we can allow any ASCII character, and any continuation byte, -// but the non-ASCII leading bytes must be 0b11000011 or 0b11000010 and nothing else. 
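(Illustrative aside, not part of the vendored patch: Latin-1 covers only U+0000..U+00FF, so the only legal multi-byte UTF-8 sequences in this conversion are the two-byte ones C2 80 through C3 BF. A minimal scalar sketch of the same accept/convert rule that the lookup tables below encode:)

#include <cstddef>

// Decode UTF-8 to Latin-1, accepting only ASCII and 0xC2/0xC3-led pairs.
// Returns the number of bytes written, or 0 on malformed input or a code
// point above U+00FF.
static size_t utf8_to_latin1_ref(const unsigned char *in, size_t n,
                                 unsigned char *out) {
  size_t j = 0;
  for (size_t i = 0; i < n;) {
    if (in[i] < 0x80) {
      out[j++] = in[i++]; // ASCII passes through unchanged
    } else if ((in[i] == 0xC2 || in[i] == 0xC3) && i + 1 < n &&
               (in[i + 1] & 0xC0) == 0x80) {
      // 110000ab 10cdefgh -> abcdefgh, i.e. U+0080..U+00FF
      out[j++] = (unsigned char)(((in[i] & 0x03) << 6) | (in[i + 1] & 0x3F));
      i += 2;
    } else {
      return 0; // not representable in Latin-1
    }
  }
  return j;
}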
-// -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - constexpr const uint8_t FORBIDDEN = 0xff; - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - FORBIDDEN, - // 1110____ ________ - FORBIDDEN, - // 1111____ ________ - FORBIDDEN - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - FORBIDDEN, - // ____0101 ________ - FORBIDDEN, - // ____011_ ________ - FORBIDDEN, - FORBIDDEN, +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 - // ____1___ ________ - FORBIDDEN, - FORBIDDEN, - FORBIDDEN, - FORBIDDEN, - FORBIDDEN, - // ____1101 ________ - FORBIDDEN, - FORBIDDEN, - FORBIDDEN - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + return utf8::count_code_points_bytemask(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, 
size_t length) const noexcept { + return utf16::utf8_length_from_utf16_bytemask(input, + length); +} - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16_bytemask(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - this->error |= check_special_cases(input, prev1); - } - - - simdutf_really_inline size_t convert(const char* in, size_t size, char* latin1_output) { - size_t pos = 0; - char* start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store((int8_t*)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. 
If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1(in + pos, - utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); - if(howmany == 0) { return 0; } - latin1_output += howmany; - } - return latin1_output - start; - } - - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char* latin1_output) { - size_t pos = 0; - char* start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store((int8_t*)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. 
For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1(in + pos, - utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); - res.count += pos; - return res; - } - if(pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - latin1_output += res.count; - } +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t len) const noexcept { + const uint8_t *str = reinterpret_cast(input); + size_t answer = len / sizeof(__m128i) * sizeof(__m128i); + size_t i = 0; + if (answer >= 2048) { // long strings optimization + __m128i two_64bits = _mm_setzero_si128(); + while (i + sizeof(__m128i) <= len) { + __m128i runner = _mm_setzero_si128(); + size_t iterations = (len - i) / sizeof(__m128i); + if (iterations > 255) { + iterations = 255; } - return result(error_code::SUCCESS, latin1_output - start); - } - - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } - - }; // struct utf8_checker -} // utf8_to_latin1 namespace -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ -/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ - - -namespace simdutf { -namespace haswell { -namespace { -namespace utf8_to_latin1 { -using namespace simd; - - - simdutf_really_inline size_t convert_valid(const char* in, size_t size, char* latin1_output) { - size_t pos = 0; - char* start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. 
However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store((int8_t*)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1(in + pos, - utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. 
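(Illustrative aside, not part of the vendored patch: the `int8_t(in[margin-1]) > -65` test in the margin loop above is a recurring simdutf idiom. Continuation bytes are 0b10xxxxxx = 0x80..0xBF, which reinterpret as int8_t to -128..-65, so the comparison is true exactly for ASCII and lead bytes. A tiny self-check:)

#include <cassert>
#include <cstdint>

// Equivalent to (b & 0xC0) != 0x80: true for ASCII and lead bytes only.
static bool is_lead_or_ascii(unsigned char b) { return int8_t(b) > -65; }

int main() {
  assert(!is_lead_or_ascii(0x80) && !is_lead_or_ascii(0xBF)); // continuations
  assert(is_lead_or_ascii(0x7F) && is_lead_or_ascii(0xC2));   // ASCII, lead
  return 0;
}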
- } + size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i); + for (; i + 4 * sizeof(__m128i) <= max_i; i += 4 * sizeof(__m128i)) { + __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i)); + __m128i input2 = + _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i))); + __m128i input3 = + _mm_loadu_si128((const __m128i *)(str + i + 2 * sizeof(__m128i))); + __m128i input4 = + _mm_loadu_si128((const __m128i *)(str + i + 3 * sizeof(__m128i))); + __m128i input12 = + _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input1), + _mm_cmpgt_epi8(_mm_setzero_si128(), input2)); + __m128i input34 = + _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input3), + _mm_cmpgt_epi8(_mm_setzero_si128(), input4)); + __m128i input1234 = _mm_add_epi8(input12, input34); + runner = _mm_sub_epi8(runner, input1234); } - if(pos < size) { - size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, latin1_output); - latin1_output += howmany; + for (; i <= max_i; i += sizeof(__m128i)) { + __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i)); + runner = _mm_sub_epi8(runner, + _mm_cmpgt_epi8(_mm_setzero_si128(), more_input)); } - return latin1_output - start; - } - - } -} // utf8_to_latin1 namespace -} // unnamed namespace -} // namespace haswell - // namespace simdutf -/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ - -namespace simdutf { -namespace haswell { - -simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { - // If there is a BOM, then we trust it. - auto bom_encoding = simdutf::BOM::check_bom(input, length); - if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - if (length % 2 == 0) { - return avx2_detect_encodings(input, length); - } else { - if (implementation::validate_utf8(input, length)) { - return simdutf::encoding_type::UTF8; - } else { - return simdutf::encoding_type::unspecified; - } - } + two_64bits = + _mm_add_epi64(two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128())); + } + answer += + _mm_extract_epi64(two_64bits, 0) + _mm_extract_epi64(two_64bits, 1); + } else if (answer > 0) { // short string optimization + for (; i + 2 * sizeof(__m128i) <= len; i += 2 * sizeof(__m128i)) { + __m128i latin = _mm_loadu_si128((const __m128i *)(input + i)); + uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin); + answer += count_ones(non_ascii); + latin = _mm_loadu_si128((const __m128i *)(input + i) + 1); + non_ascii = (uint16_t)_mm_movemask_epi8(latin); + answer += count_ones(non_ascii); + } + for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i)) { + __m128i latin = _mm_loadu_si128((const __m128i *)(input + i)); + uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin); + answer += count_ones(non_ascii); + } + } + return answer + scalar::latin1::utf8_length_from_latin1( + reinterpret_cast(str + i), len - i); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); } -simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return haswell::utf8_validation::generic_validate_utf8(buf,len); +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); } +#endif // 
SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { - return haswell::utf8_validation::generic_validate_utf8_with_errors(buf,len); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8_bytemask(input, length); +} +simdutf_warn_unused result +implementation::utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16_with_replacement( + input, length); } -simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return haswell::utf8_validation::generic_validate_ascii(buf,len); +simdutf_warn_unused result +implementation::utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16_with_replacement( + input, length); } -simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { - return haswell::utf8_validation::generic_validate_ascii_with_errors(buf,len); +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return utf32::utf8_length_from_utf32(input, length); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = avx2_validate_utf16(buf, len); - if (tail) { - return scalar::utf16::validate(tail, len - (tail - buf)); - } else { - return false; +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const __m128i v_00000000 = _mm_setzero_si128(); + const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); + size_t pos = 0; + size_t count = 0; + for (; pos + 4 <= length; pos += 4) { + __m128i in = _mm_loadu_si128((__m128i *)(input + pos)); + const __m128i surrogate_bytemask = + _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); + const uint16_t surrogate_bitmask = + static_cast(_mm_movemask_epi8(surrogate_bytemask)); + size_t surrogate_count = (16 - count_ones(surrogate_bitmask)) / 4; + count += 4 + surrogate_count; } + return count + + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = avx2_validate_utf16(buf, len); - if (tail) { - return scalar::utf16::validate(tail, len - (tail - buf)); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_BASE64 +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & 
base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } } else { - return false; + if (options == base64_options::base64_default_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } } } -simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { - result res = avx2_validate_utf16_with_errors(buf, len); - if (res.count != len) { - result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } } else { - return res; + if (options == base64_options::base64_default_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } } } -simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { - result res = avx2_validate_utf16_with_errors(buf, len); - if (res.count != len) { - result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return 
base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } } else { - return res; + if (options == base64_options::base64_default_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } } } -simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - const char32_t* tail = avx2_validate_utf32le(buf, len); - if (tail) { - return scalar::utf32::validate(tail, len - (tail - buf)); +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } } else { - return false; + if (options == base64_options::base64_default_accept_garbage) { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return base64::compress_decode_base64( + output, input, length, options, last_chunk_options); + } } } -simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { - result res = avx2_validate_utf32le_with_errors(buf, len); - if (res.count != len) { - result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + if (options & base64_url) { + return encode_base64(output, input, length, options); } else { - return res; + return encode_base64(output, input, length, options); } } -simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = avx2_convert_latin1_to_utf8(buf, len, utf8_output); - size_t converted_chars = ret.second - utf8_output; +size_t implementation::binary_to_base64_with_lines( + const char *input, size_t length, char *output, size_t line_length, + base64_options options) const noexcept { + if (options & base64_url) { + return encode_base64_impl(output, input, length, options, + line_length); - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - converted_chars += scalar_converted_chars; + } else { + return encode_base64_impl(output, input, length, options, + line_length); } - - return converted_chars; } -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = avx2_convert_latin1_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - 
size_t converted_chars = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_converted_chars == 0) { return 0; } - converted_chars += scalar_converted_chars; - } - return converted_chars; +const char *implementation::find(const char *start, const char *end, + char character) const noexcept { + return util::find(start, end, character); } -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = avx2_convert_latin1_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t converted_chars = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_converted_chars == 0) { return 0; } - converted_chars += scalar_converted_chars; - } - return converted_chars; +const char16_t *implementation::find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept { + return util::find(start, end, character); } +#endif // SIMDUTF_FEATURE_BASE64 -simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = avx2_convert_latin1_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } - size_t converted_chars = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_converted_chars == 0) { return 0; } - converted_chars += scalar_converted_chars; - } - return converted_chars; -} +} // namespace westmere +} // namespace simdutf -simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { - utf8_to_latin1::validating_transcoder converter; - return converter.convert(buf, len, latin1_output); -} +/* begin file src/simdutf/westmere/end.h */ +#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE +// nothing needed. +#else +SIMDUTF_UNTARGET_REGION +#endif -simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept { - utf8_to_latin1::validating_transcoder converter; - return converter.convert_with_errors(buf, len, latin1_output); -} +#undef SIMDUTF_SIMD_HAS_BYTEMASK +/* end file src/simdutf/westmere/end.h */ +/* end file src/westmere/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_LASX +/* begin file src/lasx/implementation.cpp */ +/* begin file src/simdutf/lasx/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "lasx" +// #define SIMDUTF_IMPLEMENTATION lasx +#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 + +#if SIMDUTF_CAN_ALWAYS_RUN_LASX +// nothing needed. 
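(Illustrative aside before the LASX kernels, not part of the vendored patch: nearly every conversion routine in this file follows the same contract, where a SIMD kernel returns a pair of (first unprocessed input pointer, current output pointer) and a scalar routine finishes the tail. A self-contained sketch of that glue; `simd_kernel` and `scalar_tail` are hypothetical stand-ins, not real simdutf entry points:)

#include <cstddef>
#include <utility>

// Stand-in "SIMD" kernel: widens Latin-1 to UTF-16 but stops at the first
// non-ASCII byte, returning (first unprocessed input, output position).
static std::pair<const char *, char16_t *>
simd_kernel(const char *in, size_t len, char16_t *out) {
  size_t i = 0;
  for (; i < len && (unsigned char)in[i] < 0x80; i++) {
    out[i] = char16_t((unsigned char)in[i]);
  }
  return std::make_pair(in + i, out + i);
}

// Stand-in scalar tail: widens the remaining bytes one by one.
// (Real scalar converters return 0 when they hit invalid input.)
static size_t scalar_tail(const char *in, size_t len, char16_t *out) {
  for (size_t i = 0; i < len; i++) {
    out[i] = char16_t((unsigned char)in[i]);
  }
  return len;
}

static size_t convert(const char *buf, size_t len, char16_t *out) {
  std::pair<const char *, char16_t *> ret = simd_kernel(buf, len, out);
  if (ret.first == nullptr) {
    return 0; // real kernels signal invalid input this way
  }
  size_t written = (size_t)(ret.second - out);
  if (ret.first != buf + len) { // input left over for the scalar routine
    size_t tail =
        scalar_tail(ret.first, len - (size_t)(ret.first - buf), ret.second);
    if (tail == 0) {
      return 0; // error detected in the tail
    }
    written += tail;
  }
  return written;
}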
+#else +SIMDUTF_TARGET_LASX +#endif +/* end file src/simdutf/lasx/begin.h */ +namespace simdutf { +namespace lasx { +namespace { +#ifndef SIMDUTF_LASX_H + #error "lasx.h must be included" +#endif +using namespace simd; + +#if SIMDUTF_FEATURE_UTF8 +// convert vmskltz/vmskgez/vmsknz to +// simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes index +const uint8_t lasx_1_2_utf8_bytes_mask[] = { + 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, + 85, 2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83, + 86, 87, 8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88, + 89, 92, 93, 10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79, + 90, 91, 94, 95, 32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100, + 101, 112, 113, 116, 117, 34, 35, 38, 39, 50, 51, 54, 55, 98, 99, + 102, 103, 114, 115, 118, 119, 40, 41, 44, 45, 56, 57, 60, 61, 104, + 105, 108, 109, 120, 121, 124, 125, 42, 43, 46, 47, 58, 59, 62, 63, + 106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148, + 149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147, + 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152, + 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143, + 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164, + 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163, + 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168, + 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253, + 170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254, + 255}; +#endif // SIMDUTF_FEATURE_UTF8 -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* input, size_t size, - char* latin1_output) const noexcept { - return utf8_to_latin1::convert_valid(input, size, latin1_output); +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 +simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) { + return __lsx_vshuf4i_b(vec, 0b10110001); } - -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert(buf, len, utf16_output); +simdutf_really_inline __m256i lasx_swap_bytes(__m256i vec) { + return __lasx_xvshuf4i_b(vec, 0b10110001); } +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert(buf, len, utf16_output); +#if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || \ + SIMDUTF_FEATURE_UTF8 +simdutf_really_inline bool is_ascii(const simd8x64 &input) { + return input.is_ascii(); } +#endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || + // SIMDUTF_FEATURE_UTF8 -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf16_output); +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_really_inline simd8 +must_be_2_3_continuation(const simd8 prev2, + const simd8 prev3) { + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + return is_third_byte ^ is_fourth_byte; } +#endif // SIMDUTF_FEATURE_UTF8 || 
SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf16_output); -} +#if SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32) +// common functions for utf8 conversions +simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) { + // Low half contains 10bbbbbb|10cccccc + // High half contains 1110aaaa|1110aaaa + const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9}; + const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff}; + + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh); + // 1110aaaa => aaaa0000 + __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4); + // 10bbbbbb 10cccccc => 0010bbbb bbcccccc + __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2*/ + perm, __lsx_vrepli_h(0x3f) /* 0x003f */); + // 0010bbbb bbcccccc => aaaabbbb bbcccccc + composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff); -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size, - char16_t* utf16_output) const noexcept { - return utf8_to_utf16::convert_valid(input, size, utf16_output); + return composed; } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size, - char16_t* utf16_output) const noexcept { - return utf8_to_utf16::convert_valid(input, size, utf16_output); +simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) { + // 10bbbbb 110aaaaa => 00bbbbb 000aaaaa + __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f)); + // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb + composed = __lsx_vbitsel_v( + __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */ + __lsx_vsrli_h(composed, 8), /* bbbbbb >> 8 */ + __lsx_vrepli_h(0x3f)); /* 0x003f */ + return composed; } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - utf8_to_utf32::validating_transcoder converter; - return converter.convert(buf, len, utf32_output); +simdutf_really_inline __m128i +convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) { + // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. + // This is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. 
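(Illustrative aside, not part of the vendored patch: in scalar terms, the 1-2 byte mixing these shuffle kernels implement is simply the following.)

// 1 byte:  0bbbbbbb          -> 00000000 0bbbbbbb
// 2 bytes: 110aaaaa 10bbbbbb -> 00000aaa aabbbbbb
static char16_t utf8_2byte_to_utf16(unsigned char lead, unsigned char cont) {
  return char16_t(((lead & 0x1F) << 6) | (cont & 0x3F));
}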
+ __m128i sh = + __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]), + 0); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh); + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000000 00bbbbbb + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits + // 1 byte: 00000000 00000000 + // 2 byte: 00000aaa aa000000 + __m128i v1f00 = lsx_splat_u16(0x1f00); + __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits + // Combine with a shift right accumulate + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000aaa aabbbbbb + composed = __lsx_vadd_h(ascii, composed); + return composed; } +#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || + // SIMDUTF_FEATURE_UTF32) -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - utf8_to_utf32::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf32_output); +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/lasx/lasx_validate_utf16.cpp */ +template +simd8 utf16_gather_high_bytes(const simd16 in0, + const simd16 in1) { + if (big_endian) { + const auto mask = simd16(0x00ff); + const auto t0 = in0 & mask; + const auto t1 = in1 & mask; + + return simd16::pack(t0, t1); + } else { + return simd16::pack_shifted_right<8>(in0, in1); + } } +/* end file src/lasx/lasx_validate_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/lasx/lasx_validate_utf32le.cpp */ +const char32_t *lasx_validate_utf32le(const char32_t *input, size_t size) { + const char32_t *end = input + size; -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, - char32_t* utf32_output) const noexcept { - return utf8_to_utf32::convert_valid(input, size, utf32_output); -} + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)input & 0x1F) && input < end) { + uint32_t word = *input++; + if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) { + return nullptr; + } + } + __m256i offset = lasx_splat_u32(0xffff2000); + __m256i standardoffsetmax = lasx_splat_u32(0xfffff7ff); + __m256i standardmax = lasx_splat_u32(0x10ffff); + __m256i currentmax = __lasx_xvldi(0x0); + __m256i currentoffsetmax = __lasx_xvldi(0x0); -simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = haswell::avx2_convert_utf16_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - latin1_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; + while (input + 8 < end) { + __m256i in = __lasx_xvld(reinterpret_cast(input), 0); + currentmax = __lasx_xvmax_wu(in, currentmax); + // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF + currentoffsetmax = + __lasx_xvmax_wu(__lasx_xvadd_w(in, offset), currentoffsetmax); + input += 8; } - return saved_bytes; -} - -simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret 
= haswell::avx2_convert_utf16_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - latin1_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; + __m256i is_zero = + __lasx_xvxor_v(__lasx_xvmax_wu(currentmax, standardmax), standardmax); + if (__lasx_xbnz_v(is_zero)) { + return nullptr; } - return saved_bytes; -} -simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = avx2_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } + is_zero = __lasx_xvxor_v(__lasx_xvmax_wu(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (__lasx_xbnz_v(is_zero)) { + return nullptr; } - ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; + return input; } -simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = avx2_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; +const result lasx_validate_utf32le_with_errors(const char32_t *input, + size_t size) { + const char32_t *start = input; + const char32_t *end = input + size; + + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)input & 0x1F) && input < end) { + uint32_t word = *input; + if (word > 0x10FFFF) { + return result(error_code::TOO_LARGE, input - start); } + if (word >= 0xD800 && word <= 0xDFFF) { + return result(error_code::SURROGATE, input - start); + } + input++; } - ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; -} -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - // optimization opportunity: implement a custom function - return convert_utf16be_to_latin1(buf, len, latin1_output); -} + __m256i offset = lasx_splat_u32(0xffff2000); + __m256i standardoffsetmax = lasx_splat_u32(0xfffff7ff); + __m256i standardmax = lasx_splat_u32(0x10ffff); + __m256i currentmax = __lasx_xvldi(0x0); + __m256i currentoffsetmax = __lasx_xvldi(0x0); -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - // 
optimization opportunity: implement a custom function - return convert_utf16le_to_latin1(buf, len, latin1_output); -} + while (input + 8 < end) { + __m256i in = __lasx_xvld(reinterpret_cast(input), 0); + currentmax = __lasx_xvmax_wu(in, currentmax); + currentoffsetmax = + __lasx_xvmax_wu(__lasx_xvadd_w(in, offset), currentoffsetmax); -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = haswell::avx2_convert_utf16_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; + __m256i is_zero = + __lasx_xvxor_v(__lasx_xvmax_wu(currentmax, standardmax), standardmax); + if (__lasx_xbnz_v(is_zero)) { + return result(error_code::TOO_LARGE, input - start); + } + is_zero = + __lasx_xvxor_v(__lasx_xvmax_wu(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (__lasx_xbnz_v(is_zero)) { + return result(error_code::SURROGATE, input - start); + } + input += 8; } - return saved_bytes; -} -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = haswell::avx2_convert_utf16_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; + return result(error_code::SUCCESS, input - start); } +/* end file src/lasx/lasx_validate_utf32le.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = haswell::avx2_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written - return ret.first; -} +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/lasx/lasx_convert_latin1_to_utf8.cpp */ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. 
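(Illustrative aside, not part of the vendored patch: the scalar form of the expansion this Latin-1 to UTF-8 kernel vectorizes is below; note that the output needs at most 2 * len bytes.)

#include <cstddef>

// ASCII passes through; bytes >= 0x80 expand to 110000aa 10bbbbbb.
static size_t latin1_to_utf8_ref(const unsigned char *in, size_t n,
                                 unsigned char *out) {
  size_t j = 0;
  for (size_t i = 0; i < n; i++) {
    if (in[i] < 0x80) {
      out[j++] = in[i];
    } else {
      out[j++] = (unsigned char)(0xC0 | (in[i] >> 6));   // 110000aa
      out[j++] = (unsigned char)(0x80 | (in[i] & 0x3F)); // 10bbbbbb
    }
  }
  return j;
}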
+*/ -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = haswell::avx2_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; +std::pair +lasx_convert_latin1_to_utf8(const char *latin1_input, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const size_t safety_margin = 12; + const char *end = latin1_input + len; + + // We always write 16 bytes, of which more than the first 8 bytes + // are valid. A safety margin of 8 is more than sufficient. + while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) { + __m128i in8 = __lsx_vld(reinterpret_cast(latin1_input), 0); + uint32_t ascii_mask = __lsx_vpickve2gr_wu(__lsx_vmskgez_b(in8), 0); + if (ascii_mask == 0xFFFF) { + __lsx_vst(in8, utf8_output, 0); + utf8_output += 16; + latin1_input += 16; + continue; } - } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written - return ret.first; -} + // We just fallback on UTF-16 code. This could be optimized/simplified + // further. + __m256i in16 = __lasx_vext2xv_hu_bu(____m256i(in8)); + // 1. prepare 2-byte values + // input 8-bit word : [aabb|bbbb] x 16 + // expected output : [1100|00aa|10bb|bbbb] x 16 + // t0 = [0000|00aa|bbbb|bb00] + __m256i t0 = __lasx_xvslli_h(in16, 2); + // t1 = [0000|00aa|0000|0000] + __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x300)); + // t3 = [0000|00aa|00bb|bbbb] + __m256i t2 = __lasx_xvbitsel_v(t1, in16, __lasx_xvrepli_h(0x3f)); + // t4 = [1100|00aa|10bb|bbbb] + __m256i t3 = __lasx_xvor_v(t2, __lasx_xvreplgr2vr_h(uint16_t(0xc080))); + // merge ASCII and 2-byte codewords + __m256i one_byte_bytemask = __lasx_xvsle_hu(in16, __lasx_xvrepli_h(0x7F)); + __m256i utf8_unpacked = __lasx_xvbitsel_v(t3, in16, one_byte_bytemask); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[(ascii_mask & 0xFF)]][0]; + __m128i shuffle0 = __lsx_vld(row0 + 1, 0); + __m128i utf8_unpacked_lo = lasx_extracti128_lo(utf8_unpacked); + __m128i utf8_packed0 = + __lsx_vshuf_b(utf8_unpacked_lo, utf8_unpacked_lo, shuffle0); + __lsx_vst(utf8_packed0, utf8_output, 0); + utf8_output += row0[0]; + + const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[(ascii_mask >> 8)]][0]; + __m128i shuffle1 = __lsx_vld(row1 + 1, 0); + __m128i utf8_unpacked_hi = lasx_extracti128_hi(utf8_unpacked); + __m128i utf8_packed1 = + __lsx_vshuf_b(utf8_unpacked_hi, utf8_unpacked_hi, shuffle1); + __lsx_vst(utf8_packed1, utf8_output, 0); + utf8_output += row1[0]; -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf16le_to_utf8(buf, len, utf8_output); -} + latin1_input += 16; + } // while -simdutf_warn_unused size_t 
implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf16be_to_utf8(buf, len, utf8_output); + return std::make_pair(latin1_input, reinterpret_cast(utf8_output)); } +/* end file src/lasx/lasx_convert_latin1_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/lasx/lasx_convert_latin1_to_utf16.cpp */ +std::pair +lasx_convert_latin1_to_utf16le(const char *buf, size_t len, + char16_t *utf16_output) { + const char *end = buf + len; -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = avx2_convert_utf32_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)utf16_output & 0x1F) && buf < end) { + *utf16_output++ = uint8_t(*buf) & 0xFF; + buf++; } - return saved_bytes; -} -simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = avx2_convert_utf32_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - latin1_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + while (end - buf >= 32) { + __m256i in8 = __lasx_xvld(reinterpret_cast(buf), 0); -simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } + __m256i inlow = __lasx_vext2xv_hu_bu(in8); + __m256i in8_high = __lasx_xvpermi_q(in8, in8, 0b00000001); + __m256i inhigh = __lasx_vext2xv_hu_bu(in8_high); + __lasx_xvst(inlow, reinterpret_cast(utf16_output), 0); + __lasx_xvst(inhigh, reinterpret_cast(utf16_output), 32); + + utf16_output += 32; + buf += 32; } - ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; -} -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - return convert_utf32_to_latin1(buf,len,latin1_output); -} + if (end - buf >= 16) { + __m128i zero = __lsx_vldi(0); + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - // 
ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } + __m128i inlow = __lsx_vilvl_b(zero, in8); + __m128i inhigh = __lsx_vilvh_b(zero, in8); + __lsx_vst(inlow, reinterpret_cast(utf16_output), 0); + __lsx_vst(inhigh, reinterpret_cast(utf16_output), 16); + + utf16_output += 16; + buf += 16; } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written - return ret.first; + return std::make_pair(buf, utf16_output); } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = haswell::avx2_convert_utf16_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; +std::pair +lasx_convert_latin1_to_utf16be(const char *buf, size_t len, + char16_t *utf16_output) { + const char *end = buf + len; + + while (((uint64_t)utf16_output & 0x1F) && buf < end) { + *utf16_output++ = char16_t((uint16_t(*buf++) << 8)); } - return saved_bytes; -} -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = haswell::avx2_convert_utf16_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; + __m256i zero = __lasx_xvldi(0); + while (end - buf >= 32) { + __m256i in8 = __lasx_xvld(reinterpret_cast(buf), 0); + + __m256i in8_shuf = __lasx_xvpermi_d(in8, 0b11011000); + + __m256i inlow = __lasx_xvilvl_b(in8_shuf, zero); + __m256i inhigh = __lasx_xvilvh_b(in8_shuf, zero); + __lasx_xvst(inlow, reinterpret_cast(utf16_output), 0); + __lasx_xvst(inhigh, reinterpret_cast(utf16_output), 32); + utf16_output += 32; + buf += 32; } - return saved_bytes; -} -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = haswell::avx2_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } + if 
(end - buf >= 16) { + __m128i zero_128 = __lsx_vldi(0); + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); + + __m128i inlow = __lsx_vilvl_b(in8, zero_128); + __m128i inhigh = __lsx_vilvh_b(in8, zero_128); + __lsx_vst(inlow, reinterpret_cast(utf16_output), 0); + __lsx_vst(inhigh, reinterpret_cast(utf16_output), 16); + utf16_output += 16; + buf += 16; } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit code units written - return ret.first; + + return std::make_pair(buf, utf16_output); } +/* end file src/lasx/lasx_convert_latin1_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/lasx/lasx_convert_latin1_to_utf32.cpp */ +std::pair +lasx_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + const char *end = buf + len; -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = haswell::avx2_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } + // LASX requires 32-byte alignment, otherwise performance will be degraded + while (((uint64_t)utf32_output & 0x1F) && buf < end) { + *utf32_output++ = ((uint32_t)*buf) & 0xFF; + buf++; } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit code units written - return ret.first; -} -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf32_to_utf8(buf, len, utf8_output); -} + while (end - buf >= 32) { + __m256i in8 = __lasx_xvld(reinterpret_cast(buf), 0); -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = avx2_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; + __m256i in32_0 = __lasx_vext2xv_wu_bu(in8); + __lasx_xvst(in32_0, reinterpret_cast(utf32_output), 0); + + __m256i in8_1 = __lasx_xvpermi_d(in8, 0b00000001); + __m256i in32_1 = __lasx_vext2xv_wu_bu(in8_1); + __lasx_xvst(in32_1, reinterpret_cast(utf32_output), 32); + + __m256i in8_2 = __lasx_xvpermi_d(in8, 0b00000010); + __m256i in32_2 = __lasx_vext2xv_wu_bu(in8_2); + __lasx_xvst(in32_2, reinterpret_cast(utf32_output), 64); + + __m256i in8_3 = __lasx_xvpermi_d(in8, 0b00000011); + __m256i in32_3 = __lasx_vext2xv_wu_bu(in8_3); + __lasx_xvst(in32_3, reinterpret_cast(utf32_output), 96); + + utf32_output += 32; + buf += 32; } - return saved_bytes; -} -simdutf_warn_unused size_t 
implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = avx2_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; + if (end - buf >= 16) { + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); + + __m128i zero = __lsx_vldi(0); + __m128i in16low = __lsx_vilvl_b(zero, in8); + __m128i in16high = __lsx_vilvh_b(zero, in8); + __m128i in32_0 = __lsx_vilvl_h(zero, in16low); + __m128i in32_1 = __lsx_vilvh_h(zero, in16low); + __m128i in32_2 = __lsx_vilvl_h(zero, in16high); + __m128i in32_3 = __lsx_vilvh_h(zero, in16high); + + __lsx_vst(in32_0, reinterpret_cast(utf32_output), 0); + __lsx_vst(in32_1, reinterpret_cast(utf32_output), 16); + __lsx_vst(in32_2, reinterpret_cast(utf32_output), 32); + __lsx_vst(in32_3, reinterpret_cast(utf32_output), 48); + + utf32_output += 16; + buf += 16; } - return saved_bytes; + + return std::make_pair(buf, utf32_output); } +/* end file src/lasx/lasx_convert_latin1_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = haswell::avx2_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/* begin file src/lasx/lasx_convert_utf8_to_utf16.cpp */ +// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 16, usually 12). +template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. 
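+  // Editor's note (illustrative addition, not upstream simdutf text): the mask + // has one bit per input byte, set on the last byte of each code point. Only + // 1-byte (ASCII) sequences make every byte a last byte, so a low-16 mask of + // 0xFFFF can only mean 16 consecutive ASCII bytes; a lead byte such as 0xC3 + // in the two-byte sequence 0xC3 0xA9 (U+00E9) leaves its bit clear.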
+ if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) { + __m128i zero = __lsx_vldi(0); + if simdutf_constexpr (match_system(big_endian)) { + __lsx_vst(__lsx_vilvl_b(zero, in), + reinterpret_cast(utf16_output), 0); + __lsx_vst(__lsx_vilvh_b(zero, in), + reinterpret_cast(utf16_output), 16); } else { - ret.second += scalar_res.count; + __lsx_vst(__lsx_vilvl_b(in, zero), + reinterpret_cast(utf16_output), 0); + __lsx_vst(__lsx_vilvh_b(in, zero), + reinterpret_cast(utf16_output), 16); } + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written - return ret.first; -} -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = haswell::avx2_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; + // 3 byte sequences are the next most common, as seen in CJK, which has long + // sequences of these. + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. + __m128i composed = convert_utf8_3_byte_to_utf16(in); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); } + + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 4; // We wrote 4 16-bit characters. + return 12; // We consumed 12 bytes. } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written - return ret.first; -} -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return convert_utf32_to_utf16le(buf, len, utf16_output); -} + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xAAAA) { + // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte + // UTF-16 code units. + __m128i composed = convert_utf8_2_byte_to_utf16(in); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return convert_utf32_to_utf16be(buf, len, utf16_output); -} + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 8; // We wrote 8 16-bit characters. + return 16; // We consumed 16 bytes. + } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return convert_utf16le_to_utf32(buf, len, utf32_output); -} + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fall back. 
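+  // Editor's note (illustrative addition, not upstream simdutf text): each + // utf8bigindex row maps the 12-bit end-of-code-point pattern to a shuffle + // row (idx) plus the number of input bytes that row covers (consumed <= 12), + // so the caller always advances by a whole number of code points.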
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return convert_utf16be_to_utf32(buf, len, utf32_output); -} + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + const __m128i zero = __lsx_vldi(0); + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + __m128i composed = convert_utf8_1_to_2_byte_to_utf16(in, idx); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + // Store + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 6; // We wrote 6 16-bit characters. + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // XXX: depending on the system scalar instructions might be faster. + // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // 1 byte: 00000000 0ccccccc + // 2 byte: xx0bbbbb x0cccccc + // 3 byte: xxbbbbbb x0cccccc + __m128i lowperm = __lsx_vpickev_h(perm, perm); + // 1 byte: 00000000 00000000 + // 2 byte: 00000000 00000000 + // 3 byte: 00000000 1110aaaa + __m128i highperm = __lsx_vpickod_h(perm, perm); + // 3 byte: aaaa0000 00000000 + highperm = __lsx_vslli_h(highperm, 12); + // ASCII + // 1 byte: 00000000 0ccccccc + // 2+byte: 00000000 00cccccc + __m128i ascii = __lsx_vand_v(lowperm, __lsx_vrepli_h(0x7f)); + // 1 byte: 00000000 00000000 + // 2 byte: xx0bbbbb 00000000 + // 3 byte: xxbbbbbb 00000000 + __m128i middlebyte = __lsx_vand_v(lowperm, lsx_splat_u16(0xFF00)); + // 1 byte: 00000000 0ccccccc + // 2 byte: 0010bbbb bbcccccc + // 3 byte: 0010bbbb bbcccccc + __m128i composed = __lsx_vor_v(__lsx_vsrli_h(middlebyte, 2), ascii); -void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { - utf16::change_endianness_utf16(input, length, output); -} + __m128i v0fff = __lsx_vreplgr2vr_h(uint16_t(0xfff)); + // aaaabbbb bbcccccc + composed = __lsx_vbitsel_v(highperm, composed, v0fff); -simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::count_code_points(input, length); -} + if simdutf_constexpr (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } -simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::count_code_points(input, length); -} + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 4; // We wrote 4 16-bit codepoints + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + __m128i expected_mask = + (__m128i)v16u8{0xf8, 0xc0, 0xc0, 0xc0, 0xf8, 0xc0, 0xc0, 0xc0, + 0xf8, 0xc0, 0xc0, 0xc0, 0x0, 0x0, 0x0, 0x0}; + __m128i expected = + (__m128i)v16u8{0xf0, 0x80, 0x80, 0x80, 0xf0, 0x80, 0x80, 0x80, + 0xf0, 0x80, 0x80, 0x80, 0x0, 0x0, 0x0, 0x0}; + __m128i check = __lsx_vseq_b(__lsx_vand_v(in, expected_mask), 
expected); + if (__lsx_bz_b(check)) + return 12; + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-16 pairs. Generating surrogate pairs is a little tricky, but + // it is easier when we can assume they are all pairs. This version does + // not use the LUT, but 4 byte sequences are less common and the overhead + // of the extra memory access is less important than the early branch + // overhead in shorter sequences. -simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { - return utf8::count_code_points(input, length); -} + // Swap byte pairs + // 10dddddd 10cccccc|10bbbbbb 11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + __m128i swap = lsx_swap_bytes(in); + // Shift left 2 bits + // cccccc00 dddddd00 xxxxxxxx bbbbbb00 + __m128i shift = __lsx_vslli_b(swap, 2); + // Create a magic number containing the low 2 bits of the trail surrogate + // and all the corrections needed to create the pair. UTF-8 4b prefix = + // -0x0000|0xF000 surrogate offset = -0x0000|0x0040 (0x10000 << 6) + // surrogate high = +0x0000|0xD800 + // surrogate low = +0xDC00|0x0000 + // ------------------------------- + // = +0xDC00|0xE7C0 + __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0)); + // Generate unadjusted trail surrogate minus lowest 2 bits + // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 + __m128i trail = __lsx_vbitsel_v(shift, swap, lsx_splat_u32(0x0000FF00)); + // Insert low 2 bits of trail surrogate to magic number for later + // 11011100 00000000 11100111 110000cc + __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic); -simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { - return count_utf8(buf,len); -} + // Generate lead surrogate + // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx + // 000000cc ccdddddd|xxxxxxxx xxxxxxxx + __m128i lead = __lsx_vbitsel_v( + __lsx_vsrli_h(__lsx_vand_v(shift, __lsx_vldi(0x3F)), 4), swap, + __lsx_vrepli_h(0x3f /* 0x003f*/)); -simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { - return scalar::utf16::latin1_length_from_utf16(length); -} + // Blend pairs + // 000000cc ccdddddd|11110aaa bbbbbb00 + __m128i blend = __lsx_vbitsel_v(lead, trail, lsx_splat_u32(0x0000FFFF)); -simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept { - return scalar::utf32::latin1_length_from_utf32(length); + // Add magic number to finish the result + // 110111CC CCDDDDDD|110110AA BBBBBBCC + __m128i composed = __lsx_vadd_h(blend, magic_with_low_2); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 6; // We wrote 3 32-bit surrogate pairs. + return 12; // We consumed 12 bytes. + } + // 3 1-4 byte sequences + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 2 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // added to fix issue https://github.com/simdutf/simdutf/issues/514 + // We only want to write 2 * 16-bit code units when that is actually what we + // have. Unfortunately, we cannot trust the input. 
So it is possible to get + // 0xff as an input byte and it should not result in a surrogate pair. We + // need to check for that. + uint32_t permbuffer[4]; + __lsx_vst(perm, permbuffer, 0); + // Mask the low and middle bytes + // 00000000 00000000 00000000 0ddddddd + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7f)); + // Because the surrogates need more work, the high surrogate is computed + // first. + __m128i middlehigh = __lsx_vslli_w(perm, 2); + // 00000000 00000000 00cccccc 00000000 + __m128i middlebyte = __lsx_vand_v(perm, lsx_splat_u32(0x00003F00)); + // Start assembling the sequence. Since the 4th byte is in the same position + // as it would be in a surrogate and there is no dependency, shift left + // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: + // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx + __m128i ab = __lsx_vbitsel_v(middlehigh, perm, lsx_splat_u32(0xFF000000)); + // Top 16 bits contains the high ten bits of the surrogate pair before + // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa + // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction + __m128i v_fffc0000 = __lsx_vreplgr2vr_w(uint32_t(0xFFFC0000)); + __m128i abc = __lsx_vbitsel_v(__lsx_vslli_w(middlebyte, 4), ab, v_fffc0000); + // Combine the low 6 or 7 bits by a shift right accumulate + // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct + // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o + // correction + __m128i composed = __lsx_vor_v(ascii, __lsx_vsrli_w(abc, 6)); + // After this is for surrogates + // Blend the low and high surrogates + // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd + __m128i mixed = __lsx_vbitsel_v(abc, composed, lsx_splat_u32(0x0000FFFF)); + // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits + // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte: + // 11110aaa bbbbbbcc|000000cc ccdddddd + __m128i v_ffff03ff = __lsx_vreplgr2vr_w(uint32_t(0xFFFF03FF)); + __m128i masked_pair = __lsx_vand_v(mixed, v_ffff03ff); + // Correct the remaining UTF-8 prefix, surrogate offset, and add the + // surrogate prefixes in one magic 16-bit addition. similar magic number but + // without the continue byte adjust and halfword swapped UTF-8 4b prefix = + // -0xF000|0x0000 surrogate offset = -0x0040|0x0000 (0x10000 << 6) + // surrogate high = +0xD800|0x0000 + // surrogate low = +0x0000|0xDC00 + // ----------------------------------- + // = +0xE7C0|0xDC00 + __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xE7C0DC00)); + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete + __m128i surrogates = __lsx_vadd_w(masked_pair, magic); + // If the high bit is 1 (s32 less than zero), this needs a surrogate pair + __m128i is_pair = __lsx_vslt_w(perm, zero); + // Select either the 4 byte surrogate pair or the 2 byte solo codepoint + // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD + __m128i selected = __lsx_vbitsel_v(composed, surrogates, is_pair); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + selected = lsx_swap_bytes(selected); + } + // Attempting to shuffle and store would be complex, just scalarize. + uint32_t buffer_tmp[4]; + __lsx_vst(selected, buffer_tmp, 0); + // Test for the top bit of the surrogate mask. Remove due to issue 514 + // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 
0x80000000 : + // 0x00800000; + for (size_t i = 0; i < 3; i++) { + // Surrogate + // Used to be if (buffer[i] & SURROGATE_MASK) { + // See discussion above. + // patch for issue https://github.com/simdutf/simdutf/issues/514 + if ((permbuffer[i] & 0xf8000000) == 0xf0000000) { + utf16_output[0] = uint16_t(buffer_tmp[i] >> 16); + utf16_output[1] = uint16_t(buffer_tmp[i] & 0xFFFF); + utf16_output += 2; + } else { + utf16_output[0] = uint16_t(buffer_tmp[i] & 0xFFFF); + utf16_output++; + } + } + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; + } } +/* end file src/lasx/lasx_convert_utf8_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/lasx/lasx_convert_utf8_to_utf32.cpp */ +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_out) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint32_t *&utf32_output = reinterpret_cast(utf32_out); + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { + // We process in chunks of 16 bytes. + // use fast implementation in src/simdutf/arm64/simd.h + // Ideally the compiler can keep the tables in registers. + __m128i zero = __lsx_vldi(0); + __m128i in16low = __lsx_vilvl_b(zero, in); + __m128i in16high = __lsx_vilvh_b(zero, in); + __m128i in32_0 = __lsx_vilvl_h(zero, in16low); + __m128i in32_1 = __lsx_vilvh_h(zero, in16low); + __m128i in32_2 = __lsx_vilvl_h(zero, in16high); + __m128i in32_3 = __lsx_vilvh_h(zero, in16high); + + __lsx_vst(in32_0, reinterpret_cast(utf32_output), 0); + __lsx_vst(in32_1, reinterpret_cast(utf32_output), 16); + __lsx_vst(in32_2, reinterpret_cast(utf32_output), 32); + __lsx_vst(in32_3, reinterpret_cast(utf32_output), 48); -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); -} + utf32_output += 16; // We wrote 16 32-bit characters. + return 16; // We consumed 16 bytes. + } + __m128i zero = __lsx_vldi(0); + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte + // UTF-32 code units. 
Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in); + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); -} + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + utf32_output += 4; // We wrote 4 32-bit characters. + return 12; // We consumed 12 bytes. + } + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if (input_utf8_end_of_code_point_mask == 0xaaa) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte + // UTF-32 code units. Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in); -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); -} + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); + __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16); -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); -} + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + __lsx_vst(utf32_high, reinterpret_cast(utf32_output), 16); + utf32_output += 6; + return 12; // We consumed 12 bytes. + } + // Either no fast path or an unimportant fast path. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; -simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf16_length_from_latin1(length); -} + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx); + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); + __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16); -simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { - return utf8::utf16_length_from_utf8(input, length); -} + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + __lsx_vst(utf32_high, reinterpret_cast(utf32_output), 16); + utf32_output += 6; + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // Shuffle + // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // Split + // 00000000 00000000 0ccccccc + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits + // Note: unmasked + // xxxxxxxx aaaaxxxx xxxxxxxx + __m128i high = + __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits + // Use 16 bit bic instead of and. 
+ // The top bits will be corrected later in the bsl + // 00000000 10bbbbbb 00000000 + __m128i middle = + __lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits + // Combine low and middle with shift right accumulate + // 00000000 00xxbbbb bbcccccc + __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2)); + // Insert top 4 bits from high byte with bitwise select + // 00000000 aaaabbbb bbcccccc + __m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000)); + __lsx_vst(composed, utf32_output, 0); + utf32_output += 4; // We wrote 4 32-bit characters. + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-32 code units. This uses the same method as the fixed 3 byte + // version, reversing and shift left insert. However, there is no need for + // a shuffle mask now, just rev16 and rev32. + // + // This version does not use the LUT, but 4 byte sequences are less common + // and the overhead of the extra memory access is less important than the + // early branch overhead in shorter sequences, so it comes last. + + // Swap pairs of bytes + // 10dddddd|10cccccc|10bbbbbb|11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + __m128i swap = lsx_swap_bytes(in); + // Shift left and insert + // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb + __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap, + __lsx_vrepli_h(0x3f /*0x003F*/)); + // Shift insert again + // xxxxxxxx xxxaaabb bbbbcccc ccdddddd + __m128i merge2 = + __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */ + __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */ + lsx_splat_u32(0x00000FFF)); + // Clear the garbage + // 00000000 000aaabb bbbbcccc ccdddddd + __m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF)); + // Store + __lsx_vst(composed, utf32_output, 0); + utf32_output += 3; // We wrote 3 32-bit characters. + return 12; // We consumed 12 bytes. + } + // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit + // due to surrogates no longer being involved. 
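+    // Editor's note (illustrative addition, not upstream simdutf text): a + // worked 4-byte case for the path below. U+1F600 is F0 9F 98 80, i.e. + // 11110_000 10_011111 10_011000 10_000000; gathering the payload bits + // aaa=000, bbbbbb=011111, cccccc=011000, dddddd=000000 yields + // 000 011111 011000 000000 = 0x1F600.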
+ __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 2 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // Ascii + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); + __m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00)); + // 00000000 00000000 0000cccc ccdddddd + __m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii); -simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf32_length_from_latin1(length); + __m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000)); + __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1)); + // Insert twice + // 00000000 000aaabb bbbbxxxx xxxxxxxx + __m128i corrected_srli2 = + __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2); + __m128i ab = + __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f)); + ab = __lsx_vsrli_w(ab, 4); + // 00000000 000aaabb bbbbcccc ccdddddd + __m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF)); + // Store + __lsx_vst(composed, utf32_output, 0); + utf32_output += 3; // We wrote 3 32-bit characters. + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; + } } +/* end file src/lasx/lasx_convert_utf8_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/lasx/lasx_convert_utf8_to_latin1.cpp */ +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + __m128i in = __lsx_vld(reinterpret_cast(input), 0); -simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *input, size_t len) const noexcept { - const uint8_t *data = reinterpret_cast(input); - size_t answer = len / sizeof(__m256i) * sizeof(__m256i); - size_t i = 0; - __m256i four_64bits = _mm256_setzero_si256(); - while (i + sizeof(__m256i) <= len) { - __m256i runner = _mm256_setzero_si256(); - // We can do up to 255 loops without overflow. 
- size_t iterations = (len - i) / sizeof(__m256i); - if (iterations > 255) { - iterations = 255; - } - size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i); - for (; i + 4*sizeof(__m256i) <= max_i; i += 4*sizeof(__m256i)) { - __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i)); - __m256i input2 = _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i))); - __m256i input3 = _mm256_loadu_si256((const __m256i *)(data + i + 2*sizeof(__m256i))); - __m256i input4 = _mm256_loadu_si256((const __m256i *)(data + i + 3*sizeof(__m256i))); - __m256i input12 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1), - _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2)); - __m256i input23 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3), - _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4)); - __m256i input1234 = _mm256_add_epi8(input12, input23); - runner = _mm256_sub_epi8( - runner, input1234); - } - for (; i <= max_i; i += sizeof(__m256i)) { - __m256i input_256_chunk = _mm256_loadu_si256((const __m256i *)(data + i)); - runner = _mm256_sub_epi8( - runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk)); - } - four_64bits = _mm256_add_epi64( - four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256())); - } - answer += _mm256_extract_epi64(four_64bits, 0) + - _mm256_extract_epi64(four_64bits, 1) + - _mm256_extract_epi64(four_64bits, 2) + - _mm256_extract_epi64(four_64bits, 3); - return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast(data + i), len - i); -} - -simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80); - const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - size_t pos = 0; - size_t count = 0; - for(;pos + 8 <= length; pos += 8) { - __m256i in = _mm256_loadu_si256((__m256i*)(input + pos)); - const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000); - const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000); - const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask); - const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask); - const uint32_t ascii_bytes_bitmask = static_cast(_mm256_movemask_epi8(ascii_bytes_bytemask)); - const uint32_t two_bytes_bitmask = static_cast(_mm256_movemask_epi8(two_bytes_bytemask)); - const uint32_t three_bytes_bitmask = static_cast(_mm256_movemask_epi8(three_bytes_bytemask)); - - size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; - size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4; - size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4; - count += 32 - 3*ascii_count - 2*two_bytes_count - three_bytes_count; - } - return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); -} - -simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - size_t pos = 0; - size_t count = 0; - for(;pos 
+ 8 <= length; pos += 8) { - __m256i in = _mm256_loadu_si256((__m256i*)(input + pos)); - const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const uint32_t surrogate_bitmask = static_cast(_mm256_movemask_epi8(surrogate_bytemask)); - size_t surrogate_count = (32-count_ones(surrogate_bitmask))/4; - count += 8 + surrogate_count; + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // Optimization note: our main path below is load-latency dependent. Thus it + // may be beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) { + // We process in chunks of 16 bytes + __lsx_vst(in, reinterpret_cast(latin1_output), 0); + latin1_output += 16; // We wrote 16 8-bit characters. + return 16; // We consumed 16 bytes. + } + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fall back. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if (idx >= 64) { + return consumed; } - return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); + // Here we should have (idx < 64); if not, there is a bug in the validation or + // elsewhere. This is a relatively easy scenario: we process SIX (6) input + // code units. The max length in bytes of six code units spanning between 1 + // and 2 bytes each is 12 bytes. Converts 6 1-2 byte UTF-8 characters to 6 + // Latin1 characters. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh); + // ascii mask + // 1 byte: 11111111 11111111 + // 2 byte: 00000000 00000000 + __m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80)); + // utf8 mask + // 1 byte: 00000000 00000000 + // 2 byte: 00111111 00111111 + __m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm), + __lsx_vldi(0b00111111)); + // mask + // 1 byte: 11111111 11111111 + // 2 byte: 00111111 00111111 + __m128i mask = __lsx_vor_v(utf8_mask, ascii_mask); + + __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask); + // writing 8 bytes even though we only care about the first 6 bytes. + __m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed); + + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + latin1_output += 6; // We wrote 6 bytes. 
+ return consumed; } +/* end file src/lasx/lasx_convert_utf8_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { - return utf8::count_code_points(input, length); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/lasx/lasx_convert_utf16_to_latin1.cpp */ +template +std::pair +lasx_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + while (end - buf >= 16) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); + if simdutf_constexpr (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + in1 = lsx_swap_bytes(in1); + } + if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vpickev_b(in1, in); + // 2. store (16 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 16; + latin1_output += 16; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); } -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); +template +std::pair +lasx_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + while (end - buf >= 16) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); + if simdutf_constexpr (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + in1 = lsx_swap_bytes(in1); + } + if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vpickev_b(in1, in); + // 2. store (16 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 16; + latin1_output += 16; + } else { + // Let us do a scalar fallback. + for (int k = 0; k < 16; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); } +/* end file src/lasx/lasx_convert_utf16_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/* begin file src/lasx/lasx_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on a single LASX register, i.e., it + loads sixteen 16-bit code units. -simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept { - return (options & base64_url) ? compress_decode_base64(output, input, length, options) : compress_decode_base64(output, input, length, options); -} + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); -} + Ad 1. -simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept { - return (options & base64_url) ? compress_decode_base64(output, input, length, options) : compress_decode_base64(output, input, length, options); -} + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) a single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. -simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length, base64_options options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); -} + For this case we only do some shuffling to obtain these 2-byte + codes and finally compress the whole LASX register with a single + shuffle. -size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept { - if(options & base64_url) { - return encode_base64(output, input, length); - } else { - return encode_base64(output, input, length); - } -} -} // namespace haswell -} // namespace simdutf + We need a 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. -/* begin file src/simdutf/haswell/end.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL -// nothing needed. -#else -SIMDUTF_UNTARGET_REGION -#endif + Ad 2. + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. -#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 -SIMDUTF_POP_DISABLE_WARNINGS -#endif // end of workaround -/* end file src/simdutf/haswell/end.h */ -/* end file src/haswell/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_PPC64 -/* begin file src/ppc64/implementation.cpp */ + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + Finally, these two registers are interleaved, forming a sixteen-element + array of 32-bit values. The array spans two LASX registers. + The bytes from the registers are compressed using two shuffles. + We need a 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed code unit from buf and utf8_output + A scalar routine should carry on the conversion of the tail. 
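+  Editor's note (illustrative addition, not upstream simdutf text): a worked + example for case 1 (Ad 1) above. U+0151 = [0000|0001|0101|0001]; the + shift-and-mask steps give [000a|aaaa|00bb|bbbb] = 0x0511, and OR-ing 0xC080 + yields 0xC591 = [1100|0101|1001|0001], i.e. the UTF8 bytes 0xC5 0x91 that + encode U+0151.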
+*/ -/* begin file src/simdutf/ppc64/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "ppc64" -// #define SIMDUTF_IMPLEMENTATION ppc64 -/* end file src/simdutf/ppc64/begin.h */ -namespace simdutf { -namespace ppc64 { -namespace { -#ifndef SIMDUTF_PPC64_H -#error "ppc64.h must be included" -#endif -using namespace simd; +template +std::pair +lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char16_t *end = buf + len; + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 -simdutf_really_inline bool is_ascii(const simd8x64& input) { - // careful: 0x80 is not ascii. - return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere(); -} + __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7ff)); + __m256i zero = __lasx_xvldi(0); + __m128i zero_128 = __lsx_vldi(0); + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + if simdutf_constexpr (!match_system(big_endian)) { + in = lasx_swap_bytes(in); + } + if (__lasx_xbnz_h(__lasx_xvslt_hu( + in, __lasx_xvrepli_h(0x7F)))) { // ASCII fast path!!!! + // 1. pack the bytes + __m256i utf8_packed = + __lasx_xvpermi_d(__lasx_xvpickev_b(in, in), 0b00001000); + // 2. store (16 bytes) + __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } -simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { - simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 - simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 - simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 - // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. - return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); -} + if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, in))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16 + // expected output : [110a|aaaa|10bb|bbbb] x 16 + // t0 = [000a|aaaa|bbbb|bb00] + __m256i t0 = __lasx_xvslli_h(in, 2); + // t1 = [000a|aaaa|0000|0000] + __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); + // t2 = [0000|0000|00bb|bbbb] + __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + __m256i t3 = __lasx_xvor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xc080)); + __m256i t4 = __lasx_xvor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m256i one_byte_bytemask = + __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F /*0x007F*/)); + __m256i utf8_unpacked = __lasx_xvbitsel_v(t4, in, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask); + uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0); + uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4); + // 4. 
pack the bytes + const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m1]][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_packed1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1); + + const uint8_t *row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_packed2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2); + // 5. store bytes + __lsx_vst(utf8_packed1, utf8_output, 0); + utf8_output += row1[0]; -simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { - simd8 is_third_byte = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80 - simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80 - // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. - return simd8(is_third_byte | is_fourth_byte); -} + __lsx_vst(utf8_packed2, utf8_output, 0); + utf8_output += row2[0]; -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf + buf += 16; + continue; + } + __m256i surrogates_bytemask = __lasx_xvseq_h( + __lasx_xvand_v(in, lasx_splat_u16(0xf800)), lasx_splat_u16(0xd800)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (__lasx_xbz_v(surrogates_bytemask)) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes -/* begin file src/generic/buf_block_reader.h */ -namespace simdutf { -namespace ppc64 { -namespace { + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. -// Walks through a buffer in block-sized increments, loading the last part with spaces -template -struct buf_block_reader { -public: - simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - simdutf_really_inline size_t block_index(); - simdutf_really_inline bool has_full_block() const; - simdutf_really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this - * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there - * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. - */ - simdutf_really_inline size_t get_remainder(uint8_t *dst) const; - simdutf_really_inline void advance(); -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text_64(const uint8_t *text) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - for (size_t i=0; i); i++) { - buf[i] = int8_t(text[i]) < ' ' ? 
'_' : int8_t(text[i]); - } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text(const simd8x64& in) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - in.store(reinterpret_cast(buf)); - for (size_t i=0; i); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } - } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m256i t0 = __lasx_xvpickev_b(in, in); + t0 = __lasx_xvilvl_b(t0, t0); -simdutf_unused static char * format_mask(uint64_t mask) { - static char *buf = reinterpret_cast(malloc(64 + 1)); - for (size_t i=0; i<64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; - } - buf[64] = '\0'; - return buf; -} + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc] + __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); + __m256i t1 = __lasx_xvand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m256i s0 = __lasx_xvsrli_h(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m256i s1 = __lasx_xvslli_h(in, 2); + // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00)); + + // [00bb|bbbb|0000|aaaa] + __m256i s2 = __lasx_xvor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); + __m256i s3 = __lasx_xvor_v(s2, v_c0e0); + __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); + __m256i s4 = __lasx_xvxor_v(s3, m0); -template -simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + // 4. expand code units 16-bit => 32-bit + __m256i out0 = __lasx_xvilvl_h(s4, t2); + __m256i out1 = __lasx_xvilvh_h(s4, t2); -template -simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m256i one_byte_bytemask = __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F)); + __m256i one_byte_bytemask_low = + __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask); + __m256i one_byte_bytemask_high = + __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask); + + __m256i one_or_two_bytes_bytemask_low = + __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero); + __m256i one_or_two_bytes_bytemask_high = + __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero); + + __m256i mask0 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_low, one_byte_bytemask_low)); + __m256i mask1 = __lasx_xvmskltz_h(__lasx_xvor_v( + one_or_two_bytes_bytemask_high, one_byte_bytemask_high)); + + uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0); + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; -template -simdutf_really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; -} + mask = __lasx_xvpickve2gr_wu(mask1, 0); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1); + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; -template -simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { - return &buf[idx]; -} + mask = __lasx_xvpickve2gr_wu(mask0, 4); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2); + __lsx_vst(utf8_2, utf8_output, 0); + utf8_output += row2[0]; -template -simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { - if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers - std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. - std::memcpy(dst, buf + idx, len - idx); - return len - idx; -} + mask = __lasx_xvpickve2gr_wu(mask1, 4); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle3 = __lsx_vld(row3, 1); + __m128i utf8_3 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3); + __lsx_vst(utf8_3, utf8_output, 0); + utf8_output += row3[0]; -template -simdutf_really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
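// Worked example of the 1/2/3-byte bit layouts described in the comment
// above (an illustrative, self-contained sketch; the helper names are not
// part of the vendored simdutf code). The Euro sign U+20AC =
// [0010|0000|1010|1100] falls under case #3 and must become the three
// bytes 0xE2 0x82 0xAC.
#include <cstdint>
constexpr uint8_t utf8_lead_3(uint16_t w) { return uint8_t((w >> 12) | 0xE0); }          // [1110|aaaa]
constexpr uint8_t utf8_cont_hi(uint16_t w) { return uint8_t(((w >> 6) & 0x3F) | 0x80); } // [10bb|bbbb]
constexpr uint8_t utf8_cont_lo(uint16_t w) { return uint8_t((w & 0x3F) | 0x80); }        // [10cc|cccc]
static_assert(utf8_lead_3(0x20AC) == 0xE2 && utf8_cont_hi(0x20AC) == 0x82 &&
                  utf8_cont_lo(0x20AC) == 0xAC,
              "U+20AC encodes as E2 82 AC");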
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + return std::make_pair(buf, reinterpret_cast(utf8_output)); } -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/buf_block_reader.h */ -/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8_validation { - -using namespace simd; - - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
- const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ +template +std::pair +lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char16_t *start = buf; + const char16_t *end = buf + len; - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } + __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7ff)); + __m256i zero = __lasx_xvldi(0); + __m128i zero_128 = __lsx_vldi(0); + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + if simdutf_constexpr (!match_system(big_endian)) { + in = lasx_swap_bytes(in); + } + if (__lasx_xbnz_h(__lasx_xvslt_hu( + in, __lasx_xvrepli_h(0x7F)))) { // ASCII fast path!!!! + // 1. pack the bytes + __m256i utf8_packed = + __lasx_xvpermi_d(__lasx_xvpickev_b(in, in), 0b00001000); + // 2. store (16 bytes) + __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } - // - // Return nonzero if there are incomplete multibyte characters at the end of the block: - // e.g. if there is a 4-byte character, but it's 3 bytes from the end. - // - simdutf_really_inline simd8 is_incomplete(const simd8 input) { - // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): - // ... 
1111____ 111_____ 11______ - static const uint8_t max_array[32] = { - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 - }; - const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); - return input.gt_bits(max_value); - } + if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, in))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16 + // expected output : [110a|aaaa|10bb|bbbb] x 16 + // t0 = [000a|aaaa|bbbb|bb00] + __m256i t0 = __lasx_xvslli_h(in, 2); + // t1 = [000a|aaaa|0000|0000] + __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); + // t2 = [0000|0000|00bb|bbbb] + __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + __m256i t3 = __lasx_xvor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xc080)); + __m256i t4 = __lasx_xvor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m256i one_byte_bytemask = + __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F /*0x007F*/)); + __m256i utf8_unpacked = __lasx_xvbitsel_v(t4, in, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask); + uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0); + uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4); + // 4. pack the bytes + const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m1]][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_packed1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1); + + const uint8_t *row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_packed2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2); + // 5. store bytes + __lsx_vst(utf8_packed1, utf8_output, 0); + utf8_output += row1[0]; - struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; - // The last input we received - simd8 prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast path) - simd8 prev_incomplete; + __lsx_vst(utf8_packed2, utf8_output, 0); + utf8_output += row2[0]; - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - // The only problem that can happen at EOF is that a multibyte character is too short - // or a byte value too large in the last bytes: check_special_cases only checks for bytes - // too large in the first of two bytes. - simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error |= this->prev_incomplete; + buf += 16; + continue; } + __m256i surrogates_bytemask = __lasx_xvseq_h( + __lasx_xvand_v(in, lasx_splat_u16(0xf800)), lasx_splat_u16(0xd800)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. 
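// The surrogate test just above exploits the fact that all UTF-16 surrogates
// share their top five bits: (w & 0xF800) == 0xD800 holds exactly for
// 0xD800..0xDFFF. A standalone sketch of the same predicate
// (is_utf16_surrogate is an illustrative name, not a simdutf function):
#include <cstdint>
constexpr bool is_utf16_surrogate(uint16_t w) { return (w & 0xF800) == 0xD800; }
static_assert(is_utf16_surrogate(0xD800) && is_utf16_surrogate(0xDFFF) &&
                  !is_utf16_surrogate(0xD7FF) && !is_utf16_surrogate(0xE000),
              "surrogates occupy exactly 0xD800..0xDFFF");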
+ if (__lasx_xbz_v(surrogates_bytemask)) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+ two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** --
+ precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+ they differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence,
+ taking into account the case (i.e., the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ __m256i t0 = __lasx_xvpickev_b(in, in);
+ t0 = __lasx_xvilvl_b(t0, t0);

-using utf8_validation::utf8_checker;

+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
+ __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ __m256i s0 = __lasx_xvsrli_h(in, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
+ __m256i s1 = __lasx_xvslli_h(in, 2);
+ // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+ s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00));
+
+ // [00bb|bbbb|0000|aaaa]
+ __m256i s2 = __lasx_xvor_v(s0, s1);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
+ __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
+ __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff);
+ __m256i m0 =
+ __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000));
+ __m256i s4 = __lasx_xvxor_v(s3, m0);

-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
-/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-/* begin file src/generic/utf8_validation/utf8_validator.h */
-namespace simdutf {
-namespace ppc64 {
-namespace {
-namespace utf8_validation {

+ // 4. expand code units 16-bit => 32-bit
+ __m256i out0 = __lasx_xvilvl_h(s4, t2);
+ __m256i out1 = __lasx_xvilvh_h(s4, t2);

-/**
- * Validates that the string is actual UTF-8.
- */
-template <class checker>
-bool generic_validate_utf8(const uint8_t * input, size_t length) {
- checker c{};
- buf_block_reader<64> reader(input, length);
- while (reader.has_full_block()) {
- simd::simd8x64<uint8_t> in(reader.full_block());
- c.check_next_input(in);
- reader.advance();
- }
- uint8_t block[64]{};
- reader.get_remainder(block);
- simd::simd8x64<uint8_t> in(block);
- c.check_next_input(in);
- reader.advance();
- c.check_eof();
- return !c.errors();
-}

+ // 5.
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m256i one_byte_bytemask = __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F)); + __m256i one_byte_bytemask_low = + __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask); + __m256i one_byte_bytemask_high = + __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask); + + __m256i one_or_two_bytes_bytemask_low = + __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero); + __m256i one_or_two_bytes_bytemask_high = + __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero); + + __m256i mask0 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_low, one_byte_bytemask_low)); + __m256i mask1 = __lasx_xvmskltz_h(__lasx_xvor_v( + one_or_two_bytes_bytemask_high, one_byte_bytemask_high)); + + uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0); + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; -bool generic_validate_utf8(const char * input, size_t length) { - return generic_validate_utf8(reinterpret_cast(input),length); -} + mask = __lasx_xvpickve2gr_wu(mask1, 0); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1); + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; -/** - * Validates that the string is actual UTF-8 and stops on errors. - */ -template -result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - if(c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input + count), length - count); - res.count += count; - return res; - } - reader.advance(); - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - if (c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input) + count, length - count); - res.count += count; - return res; - } else { - return result(error_code::SUCCESS, length); - } -} + mask = __lasx_xvpickve2gr_wu(mask0, 4); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2); + __lsx_vst(utf8_2, utf8_output, 0); + utf8_output += row2[0]; -result generic_validate_utf8_with_errors(const char * input, size_t length) { - return generic_validate_utf8_with_errors(reinterpret_cast(input),length); -} + mask = __lasx_xvpickve2gr_wu(mask1, 4); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle3 = __lsx_vld(row3, 1); + __m128i utf8_3 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3); + __lsx_vst(utf8_3, utf8_output, 0); + utf8_output += row3[0]; -template -bool 
generic_validate_ascii(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - uint8_t blocks[64]{}; - simd::simd8x64 running_or(blocks); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - running_or |= in; - reader.advance(); + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + reinterpret_cast(utf8_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - running_or |= in; - return running_or.is_ascii(); -} + } // while -bool generic_validate_ascii(const char * input, size_t length) { - return generic_validate_ascii(reinterpret_cast(input),length); + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); } +/* end file src/lasx/lasx_convert_utf16_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/lasx/lasx_convert_utf16_to_utf32.cpp */ +template +std::pair +lasx_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + const char16_t *end = buf + len; -template -result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)utf32_output & 0x1f) && buf < end) { + uint16_t word = scalar::utf16::swap_if_needed(buf[0]); + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + buf++; + } else { + if (buf + 1 >= end) { + return std::make_pair(nullptr, + reinterpret_cast(utf32_output)); + } + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = scalar::utf16::swap_if_needed(buf[1]); + 
uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char32_t *>(utf32_output));
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf32_output++ = char32_t(value);
+ buf += 2;
+ }
+ }

+ __m256i v_f800 = lasx_splat_u16(0xf800);
+ __m256i v_d800 = lasx_splat_u16(0xd800);
+ while (end - buf >= 16) {
+ __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
+ if simdutf_constexpr (!match_system(big_endian)) {
+ in = lasx_swap_bytes(in);
+ }

+ __m256i surrogates_bytemask =
+ __lasx_xvseq_h(__lasx_xvand_v(in, v_f800), v_d800);
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (__lasx_xbz_v(surrogates_bytemask)) {
+ // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+ // units
+ __m256i in_hi = __lasx_xvpermi_q(in, in, 0b00000001);
+ __lasx_xvst(__lasx_vext2xv_wu_hu(in), utf32_output, 0);
+ __lasx_xvst(__lasx_vext2xv_wu_hu(in_hi), utf32_output, 32);
+ utf32_output += 16;
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = scalar::utf16::swap_if_needed(buf[k]);
+ if ((word & 0xF800) != 0xD800) {
+ *utf32_output++ = char32_t(word);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word =
+ scalar::utf16::swap_if_needed(buf[k + 1]);
+ k++;
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char32_t *>(utf32_output));
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf32_output++ = char32_t(value);
+ }
+ }
+ buf += k;
+ }
+ } // while
+ return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
+}

+/*
+ Returns a pair: a result struct and utf32_output.
+ If there is an error, the count field of the result is the position of the
+ error. Otherwise, it is the position of the first unprocessed byte in buf
+ (even if finished). A scalar routine should carry on the conversion of the
+ tail if needed.
+*/ +template +std::pair +lasx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + const char16_t *start = buf; + const char16_t *end = buf + len; -template -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char16_t* utf16_output) noexcept { - // The implementation is not specific to haswell and should be moved to the generic directory. - size_t pos = 0; - char16_t* start{utf16_output}; - const size_t safety_margin = 16; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - // this loop could be unrolled further. For example, we could process the mask - // far more than 64 bytes. - simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { - in.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)utf32_output & 0x1f) && buf < end) { + uint16_t word = scalar::utf16::swap_if_needed(buf[0]); + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + buf++; + } else if (buf + 1 < end) { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = scalar::utf16::swap_if_needed(buf[1]); + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + buf += 2; } else { - // Slow path. We hope that the compiler will recognize that this is a slow path. - // Anything that is not a continuation mask is a 'leading byte', that is, the - // start of a new code point. - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - // The *start* of code points is not so useful, rather, we want the *end* of code points. - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times when using solely - // the slow/regular path, and at least four times if there are fast paths. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - // - // Thus we may allow convert_masked_utf8_to_utf16 to process - // more bytes at a time under a fast-path mode where 16 bytes - // are consumed at once (e.g., when encountering ASCII). 
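// To make the mask arithmetic above concrete (an illustrative sketch, not
// vendored code): a byte is a UTF-8 continuation byte iff it has the form
// 0b10xxxxxx, i.e. iff int8_t(b) <= -65, which is what in.lt(-65 + 1) tests.
// For the bytes of "Aé!" (0x41 0xC3 0xA9 0x21) the continuation mask is
// 0b0100, the leading mask is its complement 0b1011, and shifting right by
// one yields the end-of-code-point mask 0b0101: code points end at byte 0
// ('A') and byte 2 (the last byte of 'é').
#include <cstdint>
constexpr bool is_utf8_continuation(uint8_t b) { return int8_t(b) <= -65; }
static_assert(!is_utf8_continuation(0x41) && !is_utf8_continuation(0xC3) &&
                  is_utf8_continuation(0xA9) && !is_utf8_continuation(0x21),
              "only 0xA9 is a continuation byte in 41 C3 A9 21");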
- size_t consumed = convert_masked_utf8_to_utf16(input + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf32_output)); + } + } + + __m256i v_f800 = lasx_splat_u16(0xf800); + __m256i v_d800 = lasx_splat_u16(0xd800); + while (end - buf >= 16) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + if simdutf_constexpr (!match_system(big_endian)) { + in = lasx_swap_bytes(in); + } + + __m256i surrogates_bytemask = + __lasx_xvseq_h(__lasx_xvand_v(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (__lasx_xbz_v(surrogates_bytemask)) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code + // units + __m256i in_hi = __lasx_xvpermi_q(in, in, 0b00000001); + __lasx_xvst(__lasx_vext2xv_wu_hu(in), utf32_output, 0); + __lasx_xvst(__lasx_vext2xv_wu_hu(in_hi), utf32_output, 32); + utf32_output += 16; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. 
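// For reference, the recombination just above follows directly from the
// UTF-16 definition: with diff = hi - 0xD800 and diff2 = lo - 0xDC00, the
// code point is (diff << 10) + diff2 + 0x10000, and (diff | diff2) > 0x3FF
// rejects anything that is not a high surrogate followed by a low surrogate.
// A worked sketch (combine_surrogates is an illustrative name, not part of
// simdutf):
#include <cstdint>
constexpr char32_t combine_surrogates(uint16_t hi, uint16_t lo) {
  return (char32_t(uint16_t(hi - 0xD800)) << 10) +
         uint16_t(lo - 0xDC00) + 0x10000;
}
static_assert(combine_surrogates(0xD83D, 0xDE00) == 0x1F600,
              "U+1F600 is encoded as the pair 0xD83D 0xDE00");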
+ buf += k; } - } - utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); - return utf16_output - start; + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf32_output)); } +/* end file src/lasx/lasx_convert_utf16_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ - - -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8_to_utf16 { -using namespace simd; - +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/lasx/lasx_convert_utf32_to_latin1.cpp */ +std::pair +lasx_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + const __m256i shuf_mask = ____m256i( + (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}); + __m256i v_ff = __lasx_xvrepli_w(0xFF); - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
- const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + while (end - buf >= 16) { + __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i in2 = __lasx_xvld(reinterpret_cast(buf), 32); - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + __m256i in12 = __lasx_xvor_v(in1, in2); + if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) { + // 1. pack the bytes + __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask); + latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000); + __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp); + latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 16; + latin1_output += 16; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } +std::pair +lasx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; + const __m256i shuf_mask = ____m256i( + (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}); + __m256i v_ff = __lasx_xvrepli_w(0xFF); - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; + while (end - buf >= 16) { + __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i in2 = __lasx_xvld(reinterpret_cast(buf), 32); - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. 
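// Scalar model of the UTF-32 => Latin-1 routines above (an illustrative
// sketch, not vendored code): the shuffle mask {0, 4, 8, 12, 16, 20, 24, 28}
// gathers the low byte of each 32-bit lane, which is only valid once we know
// that every code point fits in a single byte.
#include <cstddef>
#include <cstdint>
inline bool utf32_to_latin1_scalar(const uint32_t *src, size_t n, uint8_t *dst) {
  for (size_t i = 0; i < n; i++) {
    if (src[i] > 0xFF) {
      return false; // the SIMD routine reports error_code::TOO_LARGE here
    }
    dst[i] = uint8_t(src[i]); // keep only the low byte, as the shuffle does
  }
  return true;
}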
- // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - - template - simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { - size_t pos = 0; - char16_t* start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. 
These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); - if(howmany == 0) { return 0; } - utf16_output += howmany; - } - return utf16_output - start; - } - - template - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { - size_t pos = 0; - char16_t* start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; + __m256i in12 = __lasx_xvor_v(in1, in2); + if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) { + // 1. pack the bytes + __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask); + latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000); + __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp); + latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 16; + latin1_output += 16; + } else { + // Let us do a scalar fallback. + for (int k = 0; k < 16; k++) { + uint32_t word = buf[k]; + if (word <= 0xff) { + *latin1_output++ = char(word); } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. 
For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - if(pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf16_output += res.count; + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); } } - return result(error_code::SUCCESS, utf16_output - start); } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} +/* end file src/lasx/lasx_convert_utf32_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/lasx/lasx_convert_utf32_to_utf8.cpp */ +std::pair +lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char32_t *end = buf + len; - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); + // load addr align 32 + while (((uint64_t)buf & 0x1F) && buf < end) { + uint32_t word = *buf; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) 
& 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); } + buf++; + } - }; // struct utf8_checker -} // utf8_to_utf16 namespace -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -// transcoding from UTF-8 to UTF-32 -/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ + __m256i v_c080 = lasx_splat_u16(0xc080); + __m256i v_07ff = lasx_splat_u16(0x07ff); + __m256i v_dfff = lasx_splat_u16(0xdfff); + __m256i v_d800 = lasx_splat_u16(0xd800); + __m256i zero = __lasx_xvldi(0); + __m128i zero_128 = __lsx_vldi(0); + __m256i forbidden_bytemask = __lasx_xvldi(0x0); -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8_to_utf32 { + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 -using namespace simd; + while (end - buf > std::ptrdiff_t(16 + safety_margin)) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i nextin = __lasx_xvld(reinterpret_cast(buf), 32); + // Check if no bits set above 16th + if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp) + __m256i utf16_packed = + __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000); -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char32_t* utf32_output) noexcept { - size_t pos = 0; - char32_t* start{utf32_output}; - const size_t safety_margin = 16; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { - in.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - size_t max_starting_point = (pos + 64) - 12; - while(pos < max_starting_point) { - size_t consumed = convert_masked_utf8_to_utf32(input + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F), + utf16_packed))) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + __m256i utf8_packed = __lasx_xvpermi_d( + __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000); + // 2. store (8 bytes) + __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! } - } - } - utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); - return utf32_output - start; -} + if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) { + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = __lasx_xvor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = __lasx_xvor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m256i one_byte_bytemask = + __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/)); + __m256i utf8_unpacked = + __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask); + uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0); + uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4); + // 4. pack the bytes + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m1]][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_packed1 = __lsx_vshuf_b( + zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1); + + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_packed2 = __lsx_vshuf_b( + zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2); + // 5. store bytes + __lsx_vst(utf8_packed1, utf8_output, 0); + utf8_output += row1[0]; + __lsx_vst(utf8_packed2, utf8_output, 0); + utf8_output += row2[0]; -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8_to_utf32 { -using namespace simd; + buf += 16; + continue; + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + forbidden_bytemask = __lasx_xvor_v( + __lasx_xvand_v( + __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. 
- simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + We precompute byte 1 for case #1 and the common byte for cases #2 & + #3 in register t2. - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. 
- // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e., the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed); + t0 = __lasx_xvilvl_b(t0, t0); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); + __m256i t1 = __lasx_xvand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m256i s1 = __lasx_xvslli_h(utf16_packed, 2); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00)); + // [00bb|bbbb|0000|aaaa] + __m256i s2 = __lasx_xvor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); + __m256i s3 = __lasx_xvor_v(s2, v_c0e0); + // __m256i v_07ff = vmovq_n_u16((uint16_t)0x07FF); + __m256i one_or_two_bytes_bytemask = + __lasx_xvsle_hu(utf16_packed, v_07ff); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); + __m256i s4 = __lasx_xvxor_v(s3, m0); - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; + // 4. expand code units 16-bit => 32-bit + __m256i out0 = __lasx_xvilvl_h(s4, t2); + __m256i out1 = __lasx_xvilvh_h(s4, t2); - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - - - simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { - size_t pos = 0; - char32_t* start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin.
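+ // Editorial note on the "one bit" mentioned above (what s4 encodes):
+ // s3's upper byte is [11bb|bbbb]. For words <= 0x07FF the top b-bit is
+ // zero, so this byte is already the two-byte lead [110b|bbbb], and the
+ // bytemask makes m0 zero, leaving s3 unchanged. For larger words,
+ // m0 = 0x4000 and the xor clears bit 14, turning the byte into the
+ // continuation [10bb|bbbb]. E.g. for U+20AC (bbbbbb = 000010):
+ // 0xC2 ^ 0x40 = 0x82, the middle byte of 0xE2 0x82 0xAC.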
- size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m256i one_byte_bytemask = + __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F)); + + __m256i one_or_two_bytes_bytemask_u16_to_u32_low = + __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero); + __m256i one_or_two_bytes_bytemask_u16_to_u32_high = + __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero); + + __m256i one_byte_bytemask_u16_to_u32_low = + __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask); + __m256i one_byte_bytemask_u16_to_u32_high = + __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask); + + __m256i mask0 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low, + one_byte_bytemask_u16_to_u32_low)); + __m256i mask1 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high, + one_byte_bytemask_u16_to_u32_high)); + + uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0); + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; + + mask = __lasx_xvpickve2gr_wu(mask1, 0); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1); + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; + + mask = __lasx_xvpickve2gr_wu(mask0, 4); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2); + __lsx_vst(utf8_2, utf8_output, 0); + utf8_output += row2[0]; + + mask = __lasx_xvpickve2gr_wu(mask1, 4); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle3 = __lsx_vld(row3, 1); + __m128i utf8_3 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3); + __lsx_vst(utf8_3, utf8_output, 0); + utf8_output += row3[0]; + + buf += 16; } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); - if(howmany == 0) { return 0; } - utf32_output += howmany; - } - return utf32_output - start; - } - - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { - size_t pos = 0; - char32_t* start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! 
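+ // Editorial sketch of the pack-table layout relied upon by steps 4-5
+ // above (inferred from `__lsx_vld(rowN, 1)` and `utf8_output += rowN[0]`):
+ //   rowN[0]     - number of UTF-8 bytes this 16-byte lane produces
+ //   rowN[1..16] - shuffle indices gathering those bytes contiguously
+ //                 (indices past 15 select zeros supplied by zero_128)
+ // A scalar equivalent for one lane would be roughly:
+ //   const uint8_t *row = &pack_1_2_3_utf8_bytes[mask & 0xFF][0];
+ //   for (int j = 0; j < row[0]; j++) { utf8_output[j] = lane[row[1 + j]]; }
+ //   utf8_output += row[0];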
- while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). 
In practice we expect an - // 85% to 90% efficiency. + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); } } - if(errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - if(pos < size) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf32_output += res.count; - } + buf += k; + } + } // while + + // check for invalid input + if (__lasx_xbnz_v(forbidden_bytemask)) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + } + return std::make_pair(buf, reinterpret_cast(utf8_output)); +} + +std::pair +lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char32_t *start = buf; + const char32_t *end = buf + len; + + // load addr align 32 + while (((uint64_t)buf & 0x1F) && buf < end) { + uint32_t word = *buf; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf8_output)); } - return result(error_code::SUCCESS, utf32_output - start); - } - - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); } + buf++; + } - }; // struct utf8_checker -} // utf8_to_utf32 namespace -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -// other functions -/* begin file src/generic/utf8.h */ + __m256i v_c080 = lasx_splat_u16(0xc080); + __m256i v_07ff = lasx_splat_u16(0x07ff); + __m256i v_dfff = lasx_splat_u16(0xdfff); + __m256i v_d800 = lasx_splat_u16(0xd800); + __m256i zero = __lasx_xvldi(0); + __m128i zero_128 = __lsx_vldi(0); + __m256i forbidden_bytemask = __lasx_xvldi(0x0); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8 { + while (end - buf > std::ptrdiff_t(16 + safety_margin)) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i nextin = __lasx_xvld(reinterpret_cast(buf), 32); -using namespace simd; + // Check if no bits set above 16th + if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // 
Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp) + __m256i utf16_packed = + __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000); -simdutf_really_inline size_t count_code_points(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.gt(-65); - count += count_ones(utf8_continuation_mask); - } - return count + scalar::utf8::count_code_points(in + pos, size - pos); -} + if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F), + utf16_packed))) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + __m256i utf8_packed = __lasx_xvpermi_d( + __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000); + // 2. store (8 bytes) + __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } -simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - // We count one word for anything that is not a continuation (so - // leading bytes). - count += 64 - count_ones(utf8_continuation_mask); - int64_t utf8_4byte = input.gteq_unsigned(240); - count += count_ones(utf8_4byte); - } - return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); -} -} // utf8 namespace -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8.h */ -/* begin file src/generic/utf16.h */ -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf16 { + if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 -template -simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos < size/32*32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { input.swap_bytes(); } - uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); - count += count_ones(not_pair) / 2; - } - return count + scalar::utf16::count_code_points(in + pos, size - pos); -} + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = __lasx_xvor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = __lasx_xvor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m256i one_byte_bytemask = + __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/)); + __m256i utf8_unpacked = + __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask); + uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0); + uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4); + // 4. 
pack the bytes + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m1]][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_packed1 = __lsx_vshuf_b( + zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1); + + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_packed2 = __lsx_vshuf_b( + zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2); + // 5. store bytes + __lsx_vst(utf8_packed1, utf8_output, 0); + utf8_output += row1[0]; -template -simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for(;pos < size/32*32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { input.swap_bytes(); } - uint64_t ascii_mask = input.lteq(0x7F); - uint64_t twobyte_mask = input.lteq(0x7FF); - uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + __lsx_vst(utf8_packed2, utf8_output, 0); + utf8_output += row2[0]; - size_t ascii_count = count_ones(ascii_mask) / 2; - size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; - size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; - size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; - count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; - } - return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); -} + buf += 16; + continue; + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + forbidden_bytemask = __lasx_xvor_v( + __lasx_xvand_v( + __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + if (__lasx_xbnz_v(forbidden_bytemask)) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf8_output)); + } + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UTF-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. We precompute byte 1 for case #1 and the common byte for cases #2 & + #3 in register t2. We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e., the number of bytes to write).
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed); + t0 = __lasx_xvilvl_b(t0, t0); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); + __m256i t1 = __lasx_xvand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); -} // utf16 -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf16.h */ + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m256i s1 = __lasx_xvslli_h(utf16_packed, 2); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3F00)); + // [00bb|bbbb|0000|aaaa] + __m256i s2 = __lasx_xvor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); + __m256i s3 = __lasx_xvor_v(s2, v_c0e0); + // __m256i v_07ff = vmovq_n_u16((uint16_t)0x07FF); + __m256i one_or_two_bytes_bytemask = + __lasx_xvsle_hu(utf16_packed, v_07ff); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); + __m256i s4 = __lasx_xvxor_v(s3, m0); -// -// Implementation-specific overrides -// -namespace simdutf { -namespace ppc64 { + // 4. expand code units 16-bit => 32-bit + __m256i out0 = __lasx_xvilvl_h(s4, t2); + __m256i out1 = __lasx_xvilvh_h(s4, t2); -simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { - // If there is a BOM, then we trust it. - auto bom_encoding = simdutf::BOM::check_bom(input, length); - if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - int out = 0; - if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } - if((length % 2) == 0) { - if(validate_utf16(reinterpret_cast(input), length/2)) { out |= encoding_type::UTF16_LE; } - } - if((length % 4) == 0) { - if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } - } + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m256i one_byte_bytemask = + __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F)); + + __m256i one_or_two_bytes_bytemask_u16_to_u32_low = + __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero); + __m256i one_or_two_bytes_bytemask_u16_to_u32_high = + __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero); + + __m256i one_byte_bytemask_u16_to_u32_low = + __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask); + __m256i one_byte_bytemask_u16_to_u32_high = + __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask); + + __m256i mask0 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low, + one_byte_bytemask_u16_to_u32_low)); + __m256i mask1 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high, + one_byte_bytemask_u16_to_u32_high)); + + uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0); + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; - return out; -} + mask = __lasx_xvpickve2gr_wu(mask1, 0); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1); + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; -simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return ppc64::utf8_validation::generic_validate_utf8(buf,len); -} + mask = __lasx_xvpickve2gr_wu(mask0, 4); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2); + __lsx_vst(utf8_2, utf8_output, 0); + utf8_output += row2[0]; -simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { - return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf,len); -} + mask = __lasx_xvpickve2gr_wu(mask1, 4); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle3 = __lsx_vld(row3, 1); + __m128i utf8_3 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3); + __lsx_vst(utf8_3, utf8_output, 0); + utf8_output += row3[0]; -simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return ppc64::utf8_validation::generic_validate_ascii(buf,len); -} + buf += 16; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
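+ // Editorial note: values in 0xD800-0xDFFF are unpaired surrogates and are
+ // invalid in UTF-32, which is why this fallback reports
+ // error_code::SURROGATE for them. The UTF-32 => UTF-16 routines further
+ // below use the standard surrogate-pair construction for code points
+ // above 0xFFFF:
+ //   word -= 0x10000;              // leaves 20 payload bits
+ //   hi = 0xD800 + (word >> 10);   // high surrogate: top 10 bits
+ //   lo = 0xDC00 + (word & 0x3FF); // low surrogate: bottom 10 bits
+ // e.g. U+1F600: 0x1F600 - 0x10000 = 0xF600, so hi = 0xD83D, lo = 0xDE00.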
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while -simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { - return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf,len); + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); } +/* end file src/lasx/lasx_convert_utf32_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/lasx/lasx_convert_utf32_to_utf16.cpp */ +template +std::pair +lasx_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); + const char32_t *end = buf + len; -simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate(buf, len); -} + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)utf16_output & 0x1F) && buf < end) { + uint32_t word = *buf++; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); + // buf++; + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + // buf++; + } + } -simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate(buf, len); -} + __m256i forbidden_bytemask = __lasx_xvrepli_h(0); + __m256i v_d800 = lasx_splat_u16(0xd800); + __m256i v_dfff = lasx_splat_u16(0xdfff); + while (end - buf >= 16) { + __m256i in0 = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 32); -simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate_with_errors(buf, len); -} + // Check if no bits set above 16th + if (__lasx_xbz_v(__lasx_xvpickod_h(in1, in0))) { + __m256i utf16_packed = + __lasx_xvpermi_d(__lasx_xvpickev_h(in1, in0), 0b11011000); + forbidden_bytemask = __lasx_xvor_v( + __lasx_xvand_v( + __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + + if simdutf_constexpr (!match_system(big_endian)) { + utf16_packed = lasx_swap_bytes(utf16_packed); + } + __lasx_xvst(utf16_packed, utf16_output, 0); + utf16_output += 16; + buf += 16; + } else { + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } -simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate_with_errors(buf, len); + // check for invalid input + if (__lasx_xbnz_v(forbidden_bytemask)) { + return std::make_pair(nullptr, reinterpret_cast(utf16_output)); + } + return std::make_pair(buf, reinterpret_cast(utf16_output)); } -simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { - return scalar::utf32::validate_with_errors(buf, len); -} +template +std::pair +lasx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); + const char32_t *start = buf; + const char32_t *end = buf + len; -simdutf_warn_unused bool implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept { - return scalar::utf32::validate(buf, len); -} + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)utf16_output & 0x1F) && buf < end) { + uint32_t word = *buf++; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(result(error_code::SURROGATE, buf - start - 1), + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start - 1), + reinterpret_cast(utf16_output)); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { - return 0; // stub -} + __m256i forbidden_bytemask = __lasx_xvrepli_h(0); + __m256i v_d800 = lasx_splat_u16(0xd800); + __m256i v_dfff = lasx_splat_u16(0xdfff); + while (end - buf >= 16) { + __m256i in0 = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 32); -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { - return 0; // stub -} + // Check if no bits set above 16th + if (__lasx_xbz_v(__lasx_xvpickod_h(in1, in0))) { + __m256i utf16_packed = + __lasx_xvpermi_d(__lasx_xvpickev_h(in1, in0), 0b11011000); + forbidden_bytemask = __lasx_xvor_v( + __lasx_xvand_v( + __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + if (__lasx_xbnz_v(forbidden_bytemask)) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf16_output)); + } -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { - return result(error_code::OTHER, 0); // stub -} + if simdutf_constexpr (!match_system(big_endian)) { + utf16_packed = lasx_swap_bytes(utf16_packed); + } -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { - return result(error_code::OTHER, 0); // stub -} + __lasx_xvst(utf16_packed, utf16_output, 0); + utf16_output += 16; + buf += 16; + } else { + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf16_output)); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { - return 0; // stub + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf16_output)); } +/* end file src/lasx/lasx_convert_utf32_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_BASE64 +/* begin file src/lasx/lasx_base64.cpp */ +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { - return 0; // stub -} +template +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + // credit: Wojciech Muła + // SSE (lookup: pshufb improved unrolled) + const uint8_t *input = (const uint8_t *)src; + static const char *lookup_tbl = + isbase64url + ? 
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + uint8_t *out = (uint8_t *)dst; -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept { - return 0; // stub -} + v32u8 shuf; + __m256i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1, + base64_tbl2, base64_tbl3; + if (srclen >= 28) { + shuf = v32u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10, + 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10}; + + v_fc0fc00 = __lasx_xvreplgr2vr_w(uint32_t(0x0fc0fc00)); + v_3f03f0 = __lasx_xvreplgr2vr_w(uint32_t(0x003f03f0)); + shift_r = __lasx_xvreplgr2vr_w(uint32_t(0x0006000a)); + shift_l = __lasx_xvreplgr2vr_w(uint32_t(0x00080004)); + base64_tbl0 = ____m256i(__lsx_vld(lookup_tbl, 0)); + base64_tbl1 = ____m256i(__lsx_vld(lookup_tbl, 16)); + base64_tbl2 = ____m256i(__lsx_vld(lookup_tbl, 32)); + base64_tbl3 = ____m256i(__lsx_vld(lookup_tbl, 48)); + } + size_t i = 0; + for (; i + 100 <= srclen; i += 96) { + __m128i in0_lo = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 0); + __m128i in0_hi = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 1); + __m128i in1_lo = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 2); + __m128i in1_hi = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 3); + __m128i in2_lo = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 4); + __m128i in2_hi = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 5); + __m128i in3_lo = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 6); + __m128i in3_hi = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 7); + + __m256i in0 = lasx_set_q(in0_hi, in0_lo); + __m256i in1 = lasx_set_q(in1_hi, in1_lo); + __m256i in2 = lasx_set_q(in2_hi, in2_lo); + __m256i in3 = lasx_set_q(in3_hi, in3_lo); + + in0 = __lasx_xvshuf_b(in0, in0, (__m256i)shuf); + in1 = __lasx_xvshuf_b(in1, in1, (__m256i)shuf); + in2 = __lasx_xvshuf_b(in2, in2, (__m256i)shuf); + in3 = __lasx_xvshuf_b(in3, in3, (__m256i)shuf); + + __m256i t0_0 = __lasx_xvand_v(in0, v_fc0fc00); + __m256i t0_1 = __lasx_xvand_v(in1, v_fc0fc00); + __m256i t0_2 = __lasx_xvand_v(in2, v_fc0fc00); + __m256i t0_3 = __lasx_xvand_v(in3, v_fc0fc00); + + __m256i t1_0 = __lasx_xvsrl_h(t0_0, shift_r); + __m256i t1_1 = __lasx_xvsrl_h(t0_1, shift_r); + __m256i t1_2 = __lasx_xvsrl_h(t0_2, shift_r); + __m256i t1_3 = __lasx_xvsrl_h(t0_3, shift_r); + + __m256i t2_0 = __lasx_xvand_v(in0, v_3f03f0); + __m256i t2_1 = __lasx_xvand_v(in1, v_3f03f0); + __m256i t2_2 = __lasx_xvand_v(in2, v_3f03f0); + __m256i t2_3 = __lasx_xvand_v(in3, v_3f03f0); + + __m256i t3_0 = __lasx_xvsll_h(t2_0, shift_l); + __m256i t3_1 = __lasx_xvsll_h(t2_1, shift_l); + __m256i t3_2 = __lasx_xvsll_h(t2_2, shift_l); + __m256i t3_3 = __lasx_xvsll_h(t2_3, shift_l); + + __m256i input0 = __lasx_xvor_v(t1_0, t3_0); + __m256i input0_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input0); + __m256i input0_shuf1 = __lasx_xvshuf_b( + base64_tbl3, base64_tbl2, __lasx_xvsub_b(input0, __lasx_xvldi(32))); + __m256i input0_mask = __lasx_xvslei_bu(input0, 31); + __m256i input0_result = + __lasx_xvbitsel_v(input0_shuf1, input0_shuf0, input0_mask); + __lasx_xvst(input0_result, reinterpret_cast<__m256i *>(out), 0); + out += 32; -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept { - return result(error_code::OTHER, 0); // stub -} + __m256i input1 = __lasx_xvor_v(t1_1, 
t3_1); + __m256i input1_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input1); + __m256i input1_shuf1 = __lasx_xvshuf_b( + base64_tbl3, base64_tbl2, __lasx_xvsub_b(input1, __lasx_xvldi(32))); + __m256i input1_mask = __lasx_xvslei_bu(input1, 31); + __m256i input1_result = + __lasx_xvbitsel_v(input1_shuf1, input1_shuf0, input1_mask); + __lasx_xvst(input1_result, reinterpret_cast<__m256i *>(out), 0); + out += 32; -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept { - return 0; // stub -} + __m256i input2 = __lasx_xvor_v(t1_2, t3_2); + __m256i input2_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input2); + __m256i input2_shuf1 = __lasx_xvshuf_b( + base64_tbl3, base64_tbl2, __lasx_xvsub_b(input2, __lasx_xvldi(32))); + __m256i input2_mask = __lasx_xvslei_bu(input2, 31); + __m256i input2_result = + __lasx_xvbitsel_v(input2_shuf1, input2_shuf0, input2_mask); + __lasx_xvst(input2_result, reinterpret_cast<__m256i *>(out), 0); + out += 32; -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert(buf, len, utf8_output); -} + __m256i input3 = __lasx_xvor_v(t1_3, t3_3); + __m256i input3_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input3); + __m256i input3_shuf1 = __lasx_xvshuf_b( + base64_tbl3, base64_tbl2, __lasx_xvsub_b(input3, __lasx_xvldi(32))); + __m256i input3_mask = __lasx_xvslei_bu(input3, 31); + __m256i input3_result = + __lasx_xvbitsel_v(input3_shuf1, input3_shuf0, input3_mask); + __lasx_xvst(input3_result, reinterpret_cast<__m256i *>(out), 0); + out += 32; + } + for (; i + 28 <= srclen; i += 24) { -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert(buf, len, utf8_output); -} + __m128i in_lo = __lsx_vld(reinterpret_cast(input + i), 0); + __m128i in_hi = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 1); -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); -} + __m256i in = lasx_set_q(in_hi, in_lo); -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); -} + // bytes from groups A, B and C are needed in separate 32-bit lanes + // in = [DDDD|CCCC|BBBB|AAAA] + // + // an input triplet has layout + // [????????|ccdddddd|bbbbcccc|aaaaaabb] + // byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next + // triplet + // + // shuffling changes the order of bytes: 1, 0, 2, 1 + // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] + // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^ + // processed bits + in = __lasx_xvshuf_b(in, in, (__m256i)shuf); -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); -} + // unpacking + // t0 = [0000cccc|cc000000|aaaaaa00|00000000] + __m256i t0 = __lasx_xvand_v(in, v_fc0fc00); + // t1 = [00000000|00cccccc|00000000|00aaaaaa] + // ((c >> 6), (a >> 10)) + __m256i t1 = __lasx_xvsrl_h(t0, shift_r); -simdutf_warn_unused size_t 
implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); -} + // t2 = [00000000|00dddddd|000000bb|bbbb0000] + __m256i t2 = __lasx_xvand_v(in, v_3f03f0); + // t3 = [00dddddd|00000000|00bbbbbb|00000000] + // ((d << 8), (b << 4)) + __m256i t3 = __lasx_xvsll_h(t2, shift_l); -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert(buf, len, utf8_output); -} + // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 + __m256i indices = __lasx_xvor_v(t1, t3); + + __m256i indices_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, indices); + __m256i indices_shuf1 = __lasx_xvshuf_b( + base64_tbl3, base64_tbl2, __lasx_xvsub_b(indices, __lasx_xvldi(32))); + __m256i indices_mask = __lasx_xvslei_bu(indices, 31); + __m256i indices_result = + __lasx_xvbitsel_v(indices_shuf1, indices_shuf0, indices_mask); + __lasx_xvst(indices_result, reinterpret_cast<__m256i *>(out), 0); + out += 32; + } -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); + return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, + srclen - i, options); } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); -} +static inline void compress(__m128i data, uint16_t mask, char *output) { + if (mask == 0) { + __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0); + return; + } + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert(buf, len, utf16_output); -} + v2u64 shufmask = {tables::base64::thintable_epi8[mask1], + tables::base64::thintable_epi8[mask2]}; -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert(buf, len, utf16_output); -} + // we increment by 0x08 the second half of the mask + const v4u32 hi = {0, 0, 0x08080808, 0x08080808}; + __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi); -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); -} + // this is the version "nearly pruned" + __m128i pruned = __lsx_vshuf_b(data, data, shufmask1); + // we still need to put the two halves together. 
+ // we compute the popcount of the first half: + int pop1 = tables::base64::BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + __m128i compactmask = + __lsx_vld(reinterpret_cast( + tables::base64::pshufb_combine_table + pop1 * 8), + 0); + __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask); -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); + __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0); } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); -} +struct block64 { + __m256i chunks[2]; +}; -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); -} +template +static inline uint32_t to_base64_mask(__m256i *src, bool *error) { + __m256i ascii_space_tbl = + ____m256i((__m128i)v16u8{0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0}); + // credit: aqrit + __m256i delta_asso; + if (default_or_url) { + delta_asso = + ____m256i((__m128i)v16u8{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x11, 0x0, 0x16}); + } else { + delta_asso = + ____m256i((__m128i)v16u8{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, + 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF}); + } + __m256i delta_values; + if (default_or_url) { + delta_values = ____m256i( + (__m128i)v16i8{int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0xFF), int8_t(0x11), + int8_t(0xFF), int8_t(0xBF), int8_t(0x10), int8_t(0xB9)}); + } else if (base64_url) { + delta_values = ____m256i( + (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3), + int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)}); + } else { + delta_values = ____m256i( + (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3), + int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)}); + } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert(buf, len, utf32_output); -} + __m256i check_asso; + if (default_or_url) { + check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, + 0x0B, 0x0E, 0x0B, 0x06}); + + } else if (base64_url) { + check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, + 0x0B, 0x06, 0x0B, 0x12}); + } else { + check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, + 0x0B, 0x0B, 0x0B, 0x0F}); + } -simdutf_warn_unused size_t 
implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert(buf, len, utf32_output); -} + __m256i check_values; + if (default_or_url) { + + check_values = ____m256i( + (__m128i)v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), + int8_t(0xB5), int8_t(0xA1), int8_t(0x00), int8_t(0x80), + int8_t(0x00), int8_t(0x80), int8_t(0x00), int8_t(0x80)}); + } else if (base64_url) { + check_values = ____m256i( + (__m128i)v16i8{int8_t(0x0), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6), + int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80), + int8_t(0xB0), int8_t(0x80), int8_t(0x0), int8_t(0x0)}); + } else { + check_values = ____m256i( + (__m128i)v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), + int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80), + int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)}); + } + + __m256i shifted = __lasx_xvsrli_b(*src, 3); + __m256i asso_index = __lasx_xvand_v(*src, __lasx_xvldi(0xF)); + __m256i delta_hash = __lasx_xvavgr_bu( + __lasx_xvshuf_b(delta_asso, delta_asso, asso_index), shifted); + __m256i check_hash = __lasx_xvavgr_bu( + __lasx_xvshuf_b(check_asso, check_asso, asso_index), shifted); + + __m256i out = __lasx_xvsadd_b( + __lasx_xvshuf_b(delta_values, delta_values, delta_hash), *src); + __m256i chk = __lasx_xvsadd_b( + __lasx_xvshuf_b(check_values, check_values, check_hash), *src); + __m256i chk_ltz = __lasx_xvmskltz_b(chk); + unsigned int mask = __lasx_xvpickve2gr_wu(chk_ltz, 0); + mask = mask | (__lsx_vpickve2gr_hu(lasx_extracti128_hi(chk_ltz), 0) << 16); + if (mask) { + __m256i ascii_space = __lasx_xvseq_b( + __lasx_xvshuf_b(ascii_space_tbl, ascii_space_tbl, asso_index), *src); + __m256i ascii_space_ltz = __lasx_xvmskltz_b(ascii_space); + unsigned int ascii_space_mask = __lasx_xvpickve2gr_wu(ascii_space_ltz, 0); + ascii_space_mask = + ascii_space_mask | + (__lsx_vpickve2gr_hu(lasx_extracti128_hi(ascii_space_ltz), 0) << 16); + *error |= (mask != ascii_space_mask); + } -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); + *src = out; + return (uint32_t)mask; } -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); +template +static inline uint64_t to_base64_mask(block64 *b, bool *error) { + *error = 0; + uint64_t m0 = + to_base64_mask(&b->chunks[0], error); + uint64_t m1 = + to_base64_mask(&b->chunks[1], error); + return m0 | (m1 << 32); } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); +static inline void copy_block(block64 *b, char *output) { + __lasx_xvst(b->chunks[0], reinterpret_cast<__m256i *>(output), 0); + __lasx_xvst(b->chunks[1], reinterpret_cast<__m256i *>(output), 32); } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return 
scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); +static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { + uint64_t nmask = ~mask; + uint64_t count = + __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0); + uint16_t *count_ptr = (uint16_t *)&count; + compress(lasx_extracti128_lo(b->chunks[0]), uint16_t(mask), output); + compress(lasx_extracti128_hi(b->chunks[0]), uint16_t(mask >> 16), + output + count_ptr[0]); + compress(lasx_extracti128_lo(b->chunks[1]), uint16_t(mask >> 32), + output + count_ptr[0] + count_ptr[1]); + compress(lasx_extracti128_hi(b->chunks[1]), uint16_t(mask >> 48), + output + count_ptr[0] + count_ptr[1] + count_ptr[2]); + return count_ones(nmask); +} + +template bool is_power_of_two(T x) { return (x & (x - 1)) == 0; } + +inline size_t compress_block_single(block64 *b, uint64_t mask, char *output) { + const size_t pos64 = trailing_zeroes(mask); + const int8_t pos = pos64 & 0xf; + + // Predefine the index vector + const v16u8 v1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + switch (pos64 >> 4) { + case 0b00: { + const __m128i lane0 = lasx_extracti128_lo(b->chunks[0]); + const __m128i lane1 = lasx_extracti128_hi(b->chunks[0]); + + const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1)); + const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1); // v1 > v0 + const __m128i sh = __lsx_vsub_b((__m128i)v1, v2); + const __m128i compressed = __lsx_vshuf_b(lane0, lane0, sh); + + __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 0 * 16), 0); + __lsx_vst(lane1, reinterpret_cast<__m128i *>(output + 1 * 16 - 1), 0); + __lasx_xvst(b->chunks[1], reinterpret_cast<__m256i *>(output + 2 * 16 - 1), + 0); + } break; + case 0b01: { + const __m128i lane0 = lasx_extracti128_lo(b->chunks[0]); + const __m128i lane1 = lasx_extracti128_hi(b->chunks[0]); + __lsx_vst(lane0, reinterpret_cast<__m128i *>(output + 0 * 16), 0); + + const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1)); + const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1); + const __m128i sh = __lsx_vsub_b((__m128i)v1, v2); + const __m128i compressed = __lsx_vshuf_b(lane1, lane1, sh); + + __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 1 * 16), 0); + __lasx_xvst(b->chunks[1], reinterpret_cast<__m256i *>(output + 2 * 16 - 1), + 0); + } break; + case 0b10: { + __lasx_xvst(b->chunks[0], reinterpret_cast<__m256i *>(output + 0 * 16), 0); + + const __m128i lane2 = lasx_extracti128_lo(b->chunks[1]); + const __m128i lane3 = lasx_extracti128_hi(b->chunks[1]); + + const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1)); + const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1); + const __m128i sh = __lsx_vsub_b((__m128i)v1, v2); + const __m128i compressed = __lsx_vshuf_b(lane2, lane2, sh); + + __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 2 * 16), 0); + __lsx_vst(lane3, reinterpret_cast<__m128i *>(output + 3 * 16 - 1), 0); + } break; + case 0b11: { + __lasx_xvst(b->chunks[0], reinterpret_cast<__m256i *>(output + 0 * 16), 0); + __lsx_vst(lasx_extracti128_lo(b->chunks[1]), + reinterpret_cast<__m128i *>(output + 2 * 16), 0); + + const __m128i lane3 = lasx_extracti128_hi(b->chunks[1]); + + const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1)); + const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1); + const __m128i sh = __lsx_vsub_b((__m128i)v1, v2); + const __m128i compressed = __lsx_vshuf_b(lane3, lane3, sh); + + __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 3 * 16), 0); + } break; + } + return 63; +} + +// The caller of this function is responsible 
to ensure that there are 64 bytes +// available from reading at src. The data is read into a block64 structure. +static inline void load_block(block64 *b, const char *src) { + b->chunks[0] = __lasx_xvld(reinterpret_cast(src), 0); + b->chunks[1] = __lasx_xvld(reinterpret_cast(src), 32); } -void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { - scalar::utf16::change_endianness_utf16(input, length, output); +// The caller of this function is responsible to ensure that there are 128 bytes +// available from reading at src. The data is read into a block64 structure. +static inline void load_block(block64 *b, const char16_t *src) { + __m256i m1 = __lasx_xvld(reinterpret_cast(src), 0); + __m256i m2 = __lasx_xvld(reinterpret_cast(src), 32); + __m256i m3 = __lasx_xvld(reinterpret_cast(src), 64); + __m256i m4 = __lasx_xvld(reinterpret_cast(src), 96); + b->chunks[0] = __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(m2, m1, 0), 0b11011000); + b->chunks[1] = __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(m4, m3, 0), 0b11011000); } -simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::count_code_points(input, length); -} +static inline void base64_decode(char *out, __m256i str) { + __m256i t0 = __lasx_xvor_v( + __lasx_xvslli_w(str, 26), + __lasx_xvslli_w(__lasx_xvand_v(str, lasx_splat_u32(0x0000ff00)), 12)); + __m256i t1 = + __lasx_xvsrli_w(__lasx_xvand_v(str, lasx_splat_u32(0x003f0000)), 2); + __m256i t2 = __lasx_xvor_v(t0, t1); + __m256i t3 = __lasx_xvor_v(t2, __lasx_xvsrli_w(str, 16)); + __m256i pack_shuffle = ____m256i( + (__m128i)v16u8{3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, 0, 0, 0, 0}); + t3 = __lasx_xvshuf_b(t3, t3, (__m256i)pack_shuffle); -simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::count_code_points(input, length); + // Store the output: + __lsx_vst(lasx_extracti128_lo(t3), out, 0); + __lsx_vst(lasx_extracti128_hi(t3), out, 12); } - -simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { - return utf8::count_code_points(input, length); +// decode 64 bytes and output 48 bytes +static inline void base64_decode_block(char *out, const char *src) { + base64_decode(out, __lasx_xvld(reinterpret_cast(src), 0)); + base64_decode(out + 24, + __lasx_xvld(reinterpret_cast(src), 32)); } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::utf8_length_from_utf16(input, length); +static inline void base64_decode_block_safe(char *out, const char *src) { + base64_decode(out, __lasx_xvld(reinterpret_cast(src), 0)); + alignas(32) char buffer[32]; + base64_decode(buffer, + __lasx_xvld(reinterpret_cast(src), 32)); + std::memcpy(out + 24, buffer, 24); } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::utf8_length_from_utf16(input, length); +static inline void base64_decode_block(char *out, block64 *b) { + base64_decode(out, b->chunks[0]); + base64_decode(out + 24, b->chunks[1]); } - -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::utf32_length_from_utf16(input, length); +static inline void base64_decode_block_safe(char *out, block64 *b) { + 
base64_decode(out, b->chunks[0]); + alignas(32) char buffer[32]; + base64_decode(buffer, b->chunks[1]); + std::memcpy(out + 24, buffer, 24); } -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::utf32_length_from_utf16(input, length); -} +template +full_result +compress_decode_base64(char *dst, const chartype *src, size_t srclen, + base64_options options, + last_chunk_handling_options last_chunk_options) { + const uint8_t *to_base64 = + default_or_url ? tables::base64::to_base64_default_or_url_value + : (base64_url ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value); + auto ri = simdutf::scalar::base64::find_end(src, srclen, options); + size_t equallocation = ri.equallocation; + size_t equalsigns = ri.equalsigns; + srclen = ri.srclen; + size_t full_input_length = ri.full_input_length; + if (srclen == 0) { + if (!ignore_garbage && equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, full_input_length, 0}; + } + char *end_of_safe_64byte_zone = + (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst; -simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { - return scalar::utf8::utf16_length_from_utf8(input, length); -} + const chartype *const srcinit = src; + const char *const dstinit = dst; + const chartype *const srcend = src + srclen; -simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { - return scalar::utf32::utf8_length_from_utf32(input, length); -} + constexpr size_t block_size = 6; + static_assert(block_size >= 2, "block_size must be at least two"); + char buffer[block_size * 64]; + char *bufferptr = buffer; + if (srclen >= 64) { + const chartype *const srcend64 = src + srclen - 64; + while (src <= srcend64) { + block64 b; + load_block(&b, src); + src += 64; + bool error = false; + uint64_t badcharmask = + to_base64_mask(&b, &error); + if (error && !ignore_garbage) { + src -= 64; + while (src < srcend && scalar::base64::is_eight_byte(*src) && + to_base64[uint8_t(*src)] <= 64) { + src++; + } + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + if (badcharmask != 0) { + if (is_power_of_two(badcharmask)) { + bufferptr += compress_block_single(&b, badcharmask, bufferptr); + } else { + bufferptr += compress_block(&b, badcharmask, bufferptr); + } + } else if (bufferptr != buffer) { + copy_block(&b, bufferptr); + bufferptr += 64; + } else { + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, &b); + } else { + base64_decode_block(dst, &b); + } + dst += 48; + } + if (bufferptr >= (block_size - 1) * 64 + buffer) { + for (size_t i = 0; i < (block_size - 2); i++) { + base64_decode_block(dst, buffer + i * 64); + dst += 48; + } + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, buffer + (block_size - 2) * 64); + } else { + base64_decode_block(dst, buffer + (block_size - 2) * 64); + } + dst += 48; + std::memcpy(buffer, buffer + (block_size - 1) * 64, + 64); // 64 might be too much + bufferptr -= (block_size - 1) * 64; + } + } + } -simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { - return scalar::utf32::utf16_length_from_utf32(input, length); -} + char *buffer_start = buffer; + // Optimization note: if this is almost full, then it is 
worth our + // time, otherwise, we should just decode directly. + int last_block = (int)((bufferptr - buffer_start) % 64); + if (last_block != 0 && srcend - src + last_block >= 64) { -simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { - return scalar::utf8::count_code_points(input, length); -} + while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { + uint8_t val = to_base64[uint8_t(*src)]; + *bufferptr = char(val); + if ((!scalar::base64::is_eight_byte(*src) || val > 64) && + !ignore_garbage) { + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + bufferptr += (val <= 63); + src++; + } + } + for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, buffer_start); + } else { + base64_decode_block(dst, buffer_start); + } + dst += 48; + } + if ((bufferptr - buffer_start) % 64 != 0) { + while (buffer_start + 4 < bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; + // lasx is little-endian + triple = scalar::u32_swap_bytes(triple); + std::memcpy(dst, &triple, 4); -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); -} + dst += 3; + buffer_start += 4; + } + if (buffer_start + 4 <= bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; + // lasx is little-endian + triple = scalar::u32_swap_bytes(triple); + std::memcpy(dst, &triple, 3); -simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept { - // skip trailing spaces - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = length; // location of the first padding character if any - size_t equalsigns = 0; - if(length > 0 && input[length - 1] == '=') { - length -= 1; - equalsigns++; - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; + dst += 3; + buffer_start += 4; } - if(length > 0 && input[length - 1] == '=') { - equalsigns++; - length -= 1; + // we may have 1, 2 or 3 bytes left and we need to decode them so let us + // backtrack + int leftover = int(bufferptr - buffer_start); + while (leftover > 0) { + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + } else { + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } + } + src--; + leftover--; } } - if(length == 0) { - if(equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation};; + if (src < srcend + equalsigns) { + full_result r = scalar::base64::base64_tail_decode( + dst, src, srcend - src, equalsigns, options, last_chunk_options); + r = scalar::base64::patch_tail_result( + r, size_t(src - srcinit), size_t(dst - dstinit), equallocation, + full_input_length, last_chunk_options); + // When is_partial(last_chunk_options) is true, we must either end with + // the end of the stream (beyond whitespace) or right after a non-ignorable + // 
character or at the very beginning of the stream. + // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS && + r.input_count < full_input_length) { + // First check if we can extend the input to the end of the stream + while (r.input_count < full_input_length && + base64_ignorable(*(srcinit + r.input_count), options)) { + r.input_count++; + } + // If we are still not at the end of the stream, then we must backtrack + // to the last non-ignorable character. + if (r.input_count < full_input_length) { + while (r.input_count > 0 && + base64_ignorable(*(srcinit + r.input_count - 1), options)) { + r.input_count--; + } + } } - return {SUCCESS, 0}; + return r; } - result r = scalar::base64::base64_tail_decode(output, input, length, options); - if(r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; + if (equalsigns > 0 && !ignore_garbage) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; } } - return r; + return {SUCCESS, srclen, size_t(dst - dstinit)}; } +/* end file src/lasx/lasx_base64.cpp */ +/* begin file src/lasx/lasx_find.cpp */ +simdutf_really_inline const char *util_find(const char *start, const char *end, + char character) noexcept { + if (start >= end) + return end; -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); -} + const int step = 32; + __m256i char_vec = __lasx_xvreplgr2vr_b(static_cast(character)); -simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept { - // skip trailing spaces - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = length; // location of the first padding character if any - size_t equalsigns = 0; - if(length > 0 && input[length - 1] == '=') { - length -= 1; - equalsigns++; - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if(length > 0 && input[length - 1] == '=') { - equalsigns++; - length -= 1; + while (end - start >= step) { + __m256i data = __lasx_xvld(reinterpret_cast(start), 0); + __m256i cmp = __lasx_xvseq_b(data, char_vec); + if (__lasx_xbnz_v(cmp)) { + __m256i res = __lasx_xvmsknz_b(cmp); + uint32_t mask0 = __lasx_xvpickve2gr_wu(res, 0); + uint32_t mask1 = __lasx_xvpickve2gr_wu(res, 4); + uint32_t mask = (mask0 | (mask1 << 16)); + return start + trailing_zeroes(mask); } + + start += step; } - if(length == 0) { - if(equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - return {SUCCESS, 0}; - } - result r = scalar::base64::base64_tail_decode(output, input, length, options); - if(r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; + + // Handle remaining bytes with scalar loop + for (; start < end; ++start) { + if (*start == character) { + return start; } } - return r; -} - -simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length, base64_options 
options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); -} -size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept { - return scalar::base64::binary_to_base64(input, length, output, options); + return end; } -} // namespace ppc64 -} // namespace simdutf - -/* begin file src/simdutf/ppc64/end.h */ -/* end file src/simdutf/ppc64/end.h */ -/* end file src/ppc64/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_RVV -/* begin file src/rvv/implementation.cpp */ +simdutf_really_inline const char16_t *util_find(const char16_t *start, + const char16_t *end, + char16_t character) noexcept { + if (start >= end) + return end; + const int step = 16; + __m256i char_vec = __lasx_xvreplgr2vr_h(static_cast(character)); + while (end - start >= step) { + __m256i data = __lasx_xvld(reinterpret_cast(start), 0); + __m256i cmp = __lasx_xvseq_h(data, char_vec); + if (__lasx_xbnz_v(cmp)) { + __m256i res = __lasx_xvmsknz_b(cmp); + uint32_t mask0 = __lasx_xvpickve2gr_wu(res, 0); + uint32_t mask1 = __lasx_xvpickve2gr_wu(res, 4); + uint32_t mask = (mask0 | (mask1 << 16)); + return start + trailing_zeroes(mask) / 2; + } + start += step; + } -/* begin file src/simdutf/rvv/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "rvv" -// #define SIMDUTF_IMPLEMENTATION rvv + // Handle remaining elements with scalar loop + for (; start < end; ++start) { + if (*start == character) { + return start; + } + } -#if SIMDUTF_CAN_ALWAYS_RUN_RVV -// nothing needed. -#else -SIMDUTF_TARGET_RVV -#endif -/* end file src/simdutf/rvv/begin.h */ -namespace simdutf { -namespace rvv { -namespace { -#ifndef SIMDUTF_RVV_H -#error "rvv.h must be included" -#endif + return end; +} +/* end file src/lasx/lasx_find.cpp */ +#endif // SIMDUTF_FEATURE_BASE64 -} // unnamed namespace -} // namespace rvv +} // namespace +} // namespace lasx } // namespace simdutf -// -// Implementation-specific overrides -// +/* begin file src/generic/buf_block_reader.h */ namespace simdutf { -namespace rvv { - -/* begin file src/rvv/rvv_length_from.inl.cpp */ - -simdutf_warn_unused size_t implementation::count_utf16le(const char16_t *src, size_t len) const noexcept { - return utf32_length_from_utf16le(src, len); -} - -simdutf_warn_unused size_t implementation::count_utf16be(const char16_t *src, size_t len) const noexcept { - return utf32_length_from_utf16be(src, len); -} - -simdutf_warn_unused size_t implementation::count_utf8(const char *src, size_t len) const noexcept { - return utf32_length_from_utf8(src, len); -} - -simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char *src, size_t len) const noexcept { - return utf32_length_from_utf8(src, len); -} - -simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t len) const noexcept { - return len; -} - -simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t len) const noexcept { - return len; -} - -simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t len) const noexcept { - return len; -} +namespace lasx { +namespace { -simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t len) const noexcept { - return len; -} +// Walks through a buffer in block-sized increments, loading the last part with +// spaces +template struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool 
has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 + * (in which case this function fills the buffer with spaces and returns 0. In + * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder + * block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); -simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char *src, size_t len) const noexcept { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((int8_t*)src, vl); - vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); - count += __riscv_vcpop_m_b1(mask, vl); - } - return count; -} +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; -template -simdutf_really_inline static size_t rvv_utf32_length_from_utf16(const char16_t *src, size_t len) { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t*)src, vl); - v = simdutf_byteflip(v, vl); - vbool2_t notHigh = __riscv_vmor_mm_b2( - __riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl), - __riscv_vmsltu_vx_u16m8_b2(v, 0xDC00, vl), vl); - count += __riscv_vcpop_m_b2(notHigh, vl); +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text_64(const uint8_t *text) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + buf[i] = int8_t(text[i]) < ' ' ? 
'_' : int8_t(text[i]); } - return count; -} - -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t *src, size_t len) const noexcept { - return rvv_utf32_length_from_utf16(src, len); -} - -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t *src, size_t len) const noexcept { - if (supports_zvbb()) - return rvv_utf32_length_from_utf16(src, len); - else - return rvv_utf32_length_from_utf16(src, len); + buf[sizeof(simd8x64)] = '\0'; + return buf; } -simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *src, size_t len) const noexcept { - size_t count = len; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((int8_t*)src, vl); - count += __riscv_vcpop_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl); +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text(const simd8x64 &in) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + if (buf[i] < ' ') { + buf[i] = '_'; + } } - return count; + buf[sizeof(simd8x64)] = '\0'; + return buf; } -template -simdutf_really_inline static size_t rvv_utf8_length_from_utf16(const char16_t *src, size_t len) { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t*)src, vl); - v = simdutf_byteflip(v, vl); - vbool2_t m234 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7F, vl); - vbool2_t m34 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7FF, vl); - vbool2_t notSur = __riscv_vmor_mm_b2( - __riscv_vmsltu_vx_u16m8_b2(v, 0xD800, vl), - __riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl), vl); - vbool2_t m3 = __riscv_vmand_mm_b2(m34, notSur, vl); - count += vl + __riscv_vcpop_m_b2(m234, vl) + __riscv_vcpop_m_b2(m3, vl); +simdutf_unused static char *format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i = 0; i < 64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; } - return count; -} - -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t *src, size_t len) const noexcept { - return rvv_utf8_length_from_utf16(src, len); + buf[64] = '\0'; + return buf; } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t *src, size_t len) const noexcept { - if (supports_zvbb()) - return rvv_utf8_length_from_utf16(src, len); - else - return rvv_utf8_length_from_utf16(src, len); -} +template +simdutf_really_inline +buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) + : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, + idx{0} {} -simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t *src, size_t len) const noexcept { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e32m8(len); - vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t*)src, vl); - vbool4_t m234 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7F, vl); - vbool4_t m34 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7FF, vl); - vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl); - count += vl + __riscv_vcpop_m_b4(m234, vl) + __riscv_vcpop_m_b4(m34, vl) + __riscv_vcpop_m_b4(m4, vl); - } - return count; +template +simdutf_really_inline size_t buf_block_reader::block_index() { + return idx; } -simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char *src, size_t len) const noexcept { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((int8_t*)src, vl); - vbool1_t m1234 = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); - vbool1_t m4 = __riscv_vmsgtu_vx_u8m8_b1( - __riscv_vreinterpret_u8m8(v), (uint8_t)0b11101111, vl); - count += __riscv_vcpop_m_b1(m1234, vl) + __riscv_vcpop_m_b1(m4, vl); - } - return count; +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; } -simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t *src, size_t len) const noexcept { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e32m8(len); - vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t*)src, vl); - vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl); - count += vl + __riscv_vcpop_m_b4(m4, vl); - } - return count; +template +simdutf_really_inline const uint8_t * +buf_block_reader::full_block() const { + return &buf[idx]; } -/* end file src/rvv/rvv_length_from.inl.cpp */ -/* begin file src/rvv/rvv_validate.inl.cpp */ - - -simdutf_warn_unused bool implementation::validate_ascii(const char *src, size_t len) const noexcept { - size_t vlmax = __riscv_vsetvlmax_e8m8(); - vint8m8_t mask = __riscv_vmv_v_x_i8m8(0, vlmax); - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((int8_t*)src, vl); - mask = __riscv_vor_vv_i8m8_tu(mask, mask, v, vl); - } - return __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(mask, 0, vlmax), vlmax) < 0; +template +simdutf_really_inline size_t +buf_block_reader::get_remainder(uint8_t *dst) const { + if (len == idx) { + return 0; + } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, + STEP_SIZE); // std::memset STEP_SIZE because it is more efficient + // to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; } -simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *src, size_t len) const noexcept { - const char *beg = src; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((int8_t*)src, vl); - long idx = __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl); - if (idx >= 0) return result(error_code::TOO_LARGE, src - beg + idx); - } - return result(error_code::SUCCESS, src - beg); +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; } -/* Returns a close estimation of the number of valid UTF-8 bytes up to the - * first invalid one, but never overestimating. 
*/ -simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src, size_t len) { - const char *beg = src; - if (len < 32) return 0; - - /* validate first three bytes */ - { - size_t idx = 3; - while (idx < len && (src[idx] >> 6) == 0b10) - ++idx; - if (idx > 3+3 || !scalar::utf8::validate(src, idx)) - return 0; - } - - static const uint64_t err1m[] = { 0x0202020202020202, 0x4915012180808080 }; - static const uint64_t err2m[] = { 0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB }; - static const uint64_t err3m[] = { 0x0101010101010101, 0X01010101BABAAEE6 }; - - const vuint8m1_t err1tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2)); - const vuint8m1_t err2tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2)); - const vuint8m1_t err3tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2)); - - size_t tail = 3; - size_t n = len - tail; - - for (size_t vl; n > 0; n -= vl, src += vl) { - vl = __riscv_vsetvl_e8m4(n); - vuint8m4_t v0 = __riscv_vle8_v_u8m4((uint8_t const*)src, vl); +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf8_validation { - uint8_t next0 = src[vl+0]; - uint8_t next1 = src[vl+1]; - uint8_t next2 = src[vl+2]; +using namespace simd; - /* fast path: ASCII */ - if (__riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u8m4_b2(v0, 0b01111111, vl), vl) < 0 && (next0|next1|next2) < 0b10000000) - continue; +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
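+  // Editor's note (illustrative addition, not upstream simdutf text): a
+  // worked example of the three-table classification, assuming the lookups
+  // above and below. For the overlong two-byte pair 0xC0 0x80, prev1 = 0xC0
+  // has high nibble 0xC, so byte_1_high yields TOO_SHORT | OVERLONG_2, and
+  // low nibble 0x0, so byte_1_low yields CARRY | OVERLONG_3 | OVERLONG_2 |
+  // OVERLONG_4. The input byte 0x80 has high nibble 0x8, so byte_2_high
+  // yields a set containing OVERLONG_2. ANDing the three lookups leaves
+  // exactly OVERLONG_2 set, flagging the sequence as an error.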
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - /* see "Validating UTF-8 In Less Than One Instruction Per Byte" - * https://arxiv.org/abs/2010.03090 */ - vuint8m4_t v1 = __riscv_vslide1down_vx_u8m4(v0, next0, vl); - vuint8m4_t v2 = __riscv_vslide1down_vx_u8m4(v1, next1, vl); - vuint8m4_t v3 = __riscv_vslide1down_vx_u8m4(v2, next2, vl); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - vuint8m4_t s1 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4(__riscv_vreinterpret_v_u8m4_u16m4(v2), 4, __riscv_vsetvlmax_e16m4())); - vuint8m4_t s3 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4(__riscv_vreinterpret_v_u8m4_u16m4(v3), 4, __riscv_vsetvlmax_e16m4())); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} - vuint8m4_t idx2 = __riscv_vand_vx_u8m4(v2, 0xF, vl); - vuint8m4_t idx1 = __riscv_vand_vx_u8m4(s1, 0xF, vl); - vuint8m4_t idx3 = __riscv_vand_vx_u8m4(s3, 0xF, vl); +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = {255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0b11110000u - 1, + 0b11100000u - 1, + 0b11000000u - 1}; + const simd8 max_value( + &max_array[sizeof(max_array) - sizeof(simd8)]); + return input.gt_bits(max_value); +} + +struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. 
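+  // (Editor's note: errors accumulate by bitwise OR across all blocks, so a
+  // single nonzero byte anywhere in this register marks the whole input as
+  // invalid; the checker never has to reset state mid-stream.)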
+ simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + simd8 prev_incomplete; - vuint8m4_t err1 = simdutf_vrgather_u8m1x4(err1tbl, idx1); - vuint8m4_t err2 = simdutf_vrgather_u8m1x4(err2tbl, idx2); - vuint8m4_t err3 = simdutf_vrgather_u8m1x4(err3tbl, idx3); - vint8m4_t errs = __riscv_vreinterpret_v_u8m4_i8m4(__riscv_vand_vv_u8m4(__riscv_vand_vv_u8m4(err1, err2, vl), err3, vl)); + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64 &input) { + if (simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = + is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; + } + } - vbool2_t is_3 = __riscv_vmsgtu_vx_u8m4_b2(v1, 0b11100000-1, vl); - vbool2_t is_4 = __riscv_vmsgtu_vx_u8m4_b2(v0, 0b11110000-1, vl); - vbool2_t is_34 = __riscv_vmor_mm_b2(is_3, is_4, vl); - vbool2_t err34 = __riscv_vmxor_mm_b2(is_34, __riscv_vmslt_vx_i8m4_b2(errs, 0, vl), vl); - vbool2_t errm = __riscv_vmor_mm_b2(__riscv_vmsgt_vx_i8m4_b2(errs, 0, vl), err34, vl); - if (__riscv_vfirst_m_b2(errm , vl) >= 0) - break; + // do not forget to call check_eof! 
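+  // Editor's sketch (illustrative, not upstream code): a typical driver for
+  // this checker, mirroring generic_validate_utf8 below --
+  //   utf8_checker c{};
+  //   while (/* full 64-byte blocks remain */) { c.check_next_input(block); }
+  //   c.check_next_input(padded_remainder); // via buf_block_reader::get_remainder
+  //   c.check_eof();
+  //   bool valid = !c.errors();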
+ simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); } - /* we need to validate the last character */ - while (tail < len && (src[0] >> 6) == 0b10) --src, ++tail; - return src - beg; -} +}; // struct utf8_checker +} // namespace utf8_validation -simdutf_warn_unused bool implementation::validate_utf8(const char *src, size_t len) const noexcept { - size_t count = rvv_count_valid_utf8(src, len); - return scalar::utf8::validate(src + count, len - count); -} +using utf8_validation::utf8_checker; -simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *src, size_t len) const noexcept { - size_t count = rvv_count_valid_utf8(src, len); - result res = scalar::utf8::validate_with_errors(src + count, len - count); - return result(res.error, count + res.count); -} +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf8_validation { -simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *src, size_t len) const noexcept { - return validate_utf16le_with_errors(src, len).error == error_code::SUCCESS; +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); } -simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *src, size_t len) const noexcept { - return validate_utf16be_with_errors(src, len).error == error_code::SUCCESS; +bool generic_validate_utf8(const char *input, size_t length) { + return generic_validate_utf8( + reinterpret_cast(input), length); } -template -simdutf_really_inline static result rvv_validate_utf16_with_errors(const char16_t *src, size_t len) { - const char16_t *beg = src; - uint16_t last = 0; - for (size_t vl; len > 0; len -= vl, src += vl, last = simdutf_byteflip(src[-1])) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v1 = __riscv_vle16_v_u16m8((const uint16_t*)src, vl); - v1 = simdutf_byteflip(v1, vl); - vuint16m8_t v0 = __riscv_vslide1up_vx_u16m8(v1, last, vl); - - vbool2_t surhi = __riscv_vmseq_vx_u16m8_b2(__riscv_vand_vx_u16m8(v0, 0xFC00, vl), 0xD800, vl); - vbool2_t surlo = __riscv_vmseq_vx_u16m8_b2(__riscv_vand_vx_u16m8(v1, 0xFC00, vl), 0xDC00, vl); - - long idx = __riscv_vfirst_m_b2(__riscv_vmxor_mm_b2(surhi, surlo, vl), vl); - if (idx >= 0) { - last = idx > 0 ? simdutf_byteflip(src[idx-1]) : last; - return result(error_code::SURROGATE, src - beg + idx - (last - 0xD800u < 0x400u)); - break; +/** + * Validates that the string is actual UTF-8 and stops on errors. 
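+ * On success the result is (SUCCESS, length). On failure the SIMD scan
+ * falls back to a scalar routine that rewinds to the start of the offending
+ * code point, so the reported count is the exact byte offset of the first
+ * error. (Editor's summary, added for clarity; see the body below.)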
+ */ +template +result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input + count), length - count); + res.count += count; + return res; } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); } - if (last - 0xD800u < 0x400u) - return result(error_code::SURROGATE, src - beg - 1); /* end on high surrogate */ - else - return result(error_code::SUCCESS, src - beg); } -simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *src, size_t len) const noexcept { - return rvv_validate_utf16_with_errors(src, len); +result generic_validate_utf8_with_errors(const char *input, size_t length) { + return generic_validate_utf8_with_errors( + reinterpret_cast(input), length); } -simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *src, size_t len) const noexcept { - if (supports_zvbb()) - return rvv_validate_utf16_with_errors(src, len); - else - return rvv_validate_utf16_with_errors(src, len); -} +} // namespace utf8_validation +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_ASCII +/* begin file src/generic/ascii_validation.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace ascii_validation { -simdutf_warn_unused bool implementation::validate_utf32(const char32_t *src, size_t len) const noexcept { - size_t vlmax = __riscv_vsetvlmax_e32m8(); - vuint32m8_t max = __riscv_vmv_v_x_u32m8(0x10FFFF, vlmax); - vuint32m8_t maxOff = __riscv_vmv_v_x_u32m8(0xFFFFF7FF, vlmax); - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e32m8(len); - vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t*)src, vl); - vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl); - max = __riscv_vmaxu_vv_u32m8_tu(max, max, v, vl); - maxOff = __riscv_vmaxu_vv_u32m8_tu(maxOff, maxOff, off, vl); +result generic_validate_ascii_with_errors(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); + + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, 
count + res.count); + } else { + return result(error_code::SUCCESS, length); } - return __riscv_vfirst_m_b4(__riscv_vmor_mm_b4( - __riscv_vmsne_vx_u32m8_b4(max, 0x10FFFF, vlmax), - __riscv_vmsne_vx_u32m8_b4(maxOff, 0xFFFFF7FF, vlmax), vlmax), vlmax) < 0; } -simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *src, size_t len) const noexcept { - const char32_t *beg = src; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e32m8(len); - vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t*)src, vl); - vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl); - long idx; - idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl); - if (idx >= 0) return result(error_code::TOO_LARGE, src - beg + idx); - idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl); - if (idx >= 0) return result(error_code::SURROGATE, src - beg + idx); +bool generic_validate_ascii(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + return false; + } + reader.advance(); } - return result(error_code::SUCCESS, src - beg); + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + return in.is_ascii(); } -/* end file src/rvv/rvv_validate.inl.cpp */ - -/* begin file src/rvv/rvv_latin1_to.inl.cpp */ - -simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char *src, size_t len, char *dst) const noexcept { - char *beg = dst; - for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { - vl = __riscv_vsetvl_e8m2(len); - vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t*)src, vl); - vbool4_t nascii = __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), 0, vl); - size_t cnt = __riscv_vcpop_m_b4(nascii, vl); - vlOut = vl + cnt; - if (cnt == 0) { - __riscv_vse8_v_u8m2((uint8_t*)dst, v1, vlOut); - continue; - } +} // namespace ascii_validation +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/ascii_validation.h */ +#endif // SIMDUTF_FEATURE_ASCII - vuint8m2_t v0 = __riscv_vor_vx_u8m2(__riscv_vsrl_vx_u8m2(v1, 6, vl), 0b11000000, vl); - v1 = __riscv_vand_vx_u8m2_mu(nascii, v1, v1, 0b10111111, vl); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + // transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf8_to_latin1 { +using namespace simd; - vuint8m4_t wide = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vwmaccu_vx_u16m4(__riscv_vwaddu_vv_u16m4(v0, v1, vl), 0xFF, v1, vl)); - vbool2_t mask = __riscv_vmsgtu_vx_u8m4_b2(__riscv_vsub_vx_u8m4(wide, 0b11000000, vl*2), 1, vl*2); - vuint8m4_t comp = __riscv_vcompress_vm_u8m4(wide, mask, vl*2); +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // For UTF-8 to Latin 1, we can allow any ASCII character, and any + // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or + // 0b11000010 and nothing else. 
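+  // (Editor's note, added for clarity: Latin 1 covers only U+0000..U+00FF,
+  // and the two-byte UTF-8 encodings of U+0080..U+00FF all begin with 0xC2
+  // or 0xC3 -- for example U+00E9 is 0xC3 0xA9 -- so every other lead byte
+  // necessarily encodes a code point above U+00FF and is marked FORBIDDEN
+  // in the tables below.)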
+ // + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + FORBIDDEN, + // 1110____ ________ + FORBIDDEN, + // 1111____ ________ + FORBIDDEN); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + FORBIDDEN, + // ____0101 ________ + FORBIDDEN, + // ____011_ ________ + FORBIDDEN, FORBIDDEN, + + // ____1___ ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, + // ____1101 ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - __riscv_vse8_v_u8m4((uint8_t*)dst, comp, vlOut); - } - return dst - beg; -} + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char *src, size_t len, char16_t *dst) const noexcept { - char16_t *beg = dst; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e8m4(len); - vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t*)src, vl); - __riscv_vse16_v_u16m8((uint16_t*)dst, __riscv_vzext_vf2_u16m8(v, vl), vl); - } - return dst - beg; + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); } -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char *src, size_t len, char16_t *dst) const noexcept { - char16_t *beg = dst; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e8m4(len); - vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t*)src, vl); - __riscv_vse16_v_u16m8((uint16_t*)dst, 
__riscv_vsll_vx_u16m8(__riscv_vzext_vf2_u16m8(v, vl), 8, vl), vl); - } - return dst - beg; -} +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; -simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char *src, size_t len, char32_t *dst) const noexcept { - char32_t *beg = dst; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e8m2(len); - vuint8m2_t v = __riscv_vle8_v_u8m2((uint8_t*)src, vl); - __riscv_vse32_v_u32m8((uint32_t*)dst, __riscv_vzext_vf4_u32m8(v, vl), vl); + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); } - return dst - beg; -} - -/* end file src/rvv/rvv_latin1_to.inl.cpp */ -/* begin file src/rvv/rvv_utf8_to.inl.cpp */ -template -simdutf_really_inline static size_t rvv_utf32_store_utf16_m4(uint16_t *dst, vuint32m4_t utf32, size_t vl, vbool4_t m4even) { - /* convert [000000000000aaaa|aaaaaabbbbbbbbbb] - * to [110111bbbbbbbbbb|110110aaaaaaaaaa] */ - vuint32m4_t sur = __riscv_vsub_vx_u32m4(utf32, 0x10000, vl); - sur = __riscv_vor_vv_u32m4(__riscv_vsll_vx_u32m4(sur, 16, vl), - __riscv_vsrl_vx_u32m4(sur, 10, vl), vl); - sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vl); - sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vl); - /* merge 1 byte utf32 and 2 byte sur */ - vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(utf32, 0xFFFF, vl); - vuint16m4_t utf32_16 = __riscv_vreinterpret_v_u32m4_u16m4(__riscv_vmerge_vvm_u32m4(utf32, sur, m4, vl)); - /* compress and store */ - vbool4_t mOut = __riscv_vmor_mm_b4(__riscv_vmsne_vx_u16m4_b4(utf32_16, 0, vl*2), m4even, vl*2); - vuint16m4_t vout = __riscv_vcompress_vm_u16m4(utf32_16, mOut, vl*2); - vl = __riscv_vcpop_m_b4(mOut, vl*2); - __riscv_vse16_v_u16m4(dst, simdutf_byteflip(vout, vl), vl); - return vl; -}; - -template -simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, size_t len, Tdst *dst) { - static_assert(std::is_same() || std::is_same(), "invalid type"); - constexpr bool is16 = std::is_same(); - constexpr endianness endian = bflip == simdutf_ByteFlip::NONE ? endianness::LITTLE : endianness::BIG; - const auto scalar = [](char const *in, size_t count, Tdst *out) { - return is16 ? scalar::utf8_to_utf16::convert(in, count, (char16_t*)out) - : scalar::utf8_to_utf32::convert(in, count, (char32_t*)out); - }; - if (len < 32) return scalar(src, len, dst); - - /* validate first three bytes */ - if (validate) { - size_t idx = 3; - while (idx < len && (src[idx] >> 6) == 0b10) - ++idx; - if (idx > 3+3 || !scalar::utf8::validate(src, idx)) + simdutf_really_inline size_t convert(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. 
However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 16; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
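+        // (Editor's note: in the worst case only 52 of the 64 bytes make
+        // forward progress, and 52/64 is roughly 81%, which is where the
+        // 80% figure above comes from.)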
+ } + } + if (errors()) { return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if (howmany == 0) { + return 0; + } + latin1_output += howmany; + } + return latin1_output - start; } - size_t tail = 3; - size_t n = len - tail; - Tdst *beg = dst; - - static const uint64_t err1m[] = { 0x0202020202020202, 0x4915012180808080 }; - static const uint64_t err2m[] = { 0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB }; - static const uint64_t err3m[] = { 0x0101010101010101, 0X01010101BABAAEE6 }; + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. 
+ // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + latin1_output += res.count; + } + } + return result(error_code::SUCCESS, latin1_output - start); + } - const vuint8m1_t err1tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2)); - const vuint8m1_t err2tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2)); - const vuint8m1_t err3tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2)); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - size_t vl8m2 = __riscv_vsetvlmax_e8m2(); - vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(__riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); +}; // struct utf8_checker +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf8_to_latin1 { +using namespace simd; - for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dst += vlOut) { - vl = __riscv_vsetvl_e8m2(n); +simdutf_really_inline size_t convert_valid(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last + // 16 bytes, and if the data is valid, then it is entirely safe because 16 + // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally + // assume that you have valid UTF-8 input, so we are going to go back from the + // end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. 
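Restated on its own, the back-scan above walks from the end of the buffer until it has seen eight lead bytes, so the bounded overwrite of the masked kernels always lands inside the buffer, even on invalid input. A standalone sketch of the computation, mirroring the loop just shown and the safety_margin declaration that follows (the helper name is invented for illustration):

#include <cstddef>
#include <cstdint>

// How many trailing bytes to reserve for the scalar tail so the vector loop
// cannot read or write past the end, even on invalid input.
static size_t compute_safety_margin(const char *in, size_t size) {
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    // Lead bytes and ASCII have signed value > -65 (not 0b10xxxxxx).
    leading_byte += (int8_t(in[margin - 1]) > -65);
  }
  return size - margin + 1;
}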
+ const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, + latin1_output); + latin1_output += howmany; + } + return latin1_output - start; +} - vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const*)src, vl); - uint64_t max = __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl)); +} // namespace utf8_to_latin1 +} // namespace +} // namespace lasx +} // namespace simdutf + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + // transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf8_to_utf16 { - uint8_t next0 = src[vl+0]; - uint8_t next1 = src[vl+1]; - uint8_t next2 = src[vl+2]; +using namespace simd; - /* fast path: ASCII */ - if ((max|next0|next1|next2) < 0b10000000) { - vlOut = vl; - if (is16) __riscv_vse16_v_u16m4((uint16_t*)dst, simdutf_byteflip(__riscv_vzext_vf2_u16m4(v0, vlOut), vlOut), vlOut); - else __riscv_vse32_v_u32m8((uint32_t*)dst, __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut); - continue; +template +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. + size_t pos = 0; + char16_t *start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! 
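When is_ascii() holds for a whole 64-byte block, store_ascii_as_utf16 amounts to zero-extending each byte, since ASCII code units are numerically identical in UTF-8 and UTF-16. A scalar equivalent of that fast path, byte order aside (illustrative only; the vector version typically interleaves with zeros):

#include <cstddef>
#include <cstdint>

// Widen one all-ASCII 64-byte block to 64 UTF-16 code units.
static void ascii_block_to_utf16(const char *in, char16_t *out) {
  for (size_t i = 0; i < 64; i++) {
    out[i] = char16_t(uint8_t(in[i])); // zero-extend: U+0000..U+007F
  }
}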
+ while (pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the + // mask far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow + // path. Anything that is not a continuation mask is a 'leading byte', + // that is, the start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* + // of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16( + input + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
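The inner loop above hinges on one invariant: convert_masked_utf8_to_utf16 consumes a whole number of code points and returns how many bytes it ate, so shifting the mask right by consumed keeps bit 0 aligned with the new input position. The driver's shape, with a hypothetical consume callback standing in for the masked kernel (a sketch, not the simdutf signature):

#include <cstddef>
#include <cstdint>

// Walk a 64-byte block in code-point-aligned steps; consume() must eat at
// least one byte per call, and only ever whole code points.
template <typename Consume>
static size_t masked_driver(const char *in, uint64_t end_of_code_point_mask,
                            Consume consume) {
  size_t pos = 0;
  const size_t max_starting_point = 64 - 12; // keep >= 12 bytes in view
  while (pos < max_starting_point) {
    size_t consumed = consume(in + pos, end_of_code_point_mask);
    pos += consumed;                     // advance the cursor...
    end_of_code_point_mask >>= consumed; // ...and realign the mask with it
  }
  return pos; // bytes fully transcoded; the remainder is processed again
}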
} + } + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); + return utf16_output - start; +} - /* see "Validating UTF-8 In Less Than One Instruction Per Byte" - * https://arxiv.org/abs/2010.03090 */ - vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl); - vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl); - vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl); - - if (validate) { - vuint8m2_t s1 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_v_u8m2_u16m2(v2), 4, __riscv_vsetvlmax_e16m2())); - vuint8m2_t s3 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_v_u8m2_u16m2(v3), 4, __riscv_vsetvlmax_e16m2())); +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf8_to_utf16 { +using namespace simd; - vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl); - vuint8m2_t idx1 = __riscv_vand_vx_u8m2(s1, 0xF, vl); - vuint8m2_t idx3 = __riscv_vand_vx_u8m2(s3, 0xF, vl); +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
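In check_special_cases, each two-byte window is classified by three 16-entry table lookups, keyed on the previous byte's high nibble (the byte_1_high table above), its low nibble, and the current byte's high nibble (the two tables that follow); an error flag fires only if it survives the AND of all three, so one AND replaces a thicket of range checks. A toy scalar model of that scheme (the table contents are left as parameters here; the real ones are the TOO_SHORT/TOO_LONG/... constants above):

#include <cstdint>

// Nonzero means the two-byte window (prev, cur) tripped some error flag
// that all three tables agree on.
static uint8_t classify_window(uint8_t prev, uint8_t cur,
                               const uint8_t byte_1_high[16],
                               const uint8_t byte_1_low[16],
                               const uint8_t byte_2_high[16]) {
  return uint8_t(byte_1_high[prev >> 4] & byte_1_low[prev & 0x0F] &
                 byte_2_high[cur >> 4]);
}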
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - vuint8m2_t err1 = simdutf_vrgather_u8m1x2(err1tbl, idx1); - vuint8m2_t err2 = simdutf_vrgather_u8m1x2(err2tbl, idx2); - vuint8m2_t err3 = simdutf_vrgather_u8m1x2(err3tbl, idx3); - vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl)); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000-1, vl); - vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000-1, vl); - vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl); - vbool4_t err34 = __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl); - vbool4_t errm = __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl); - if (__riscv_vfirst_m_b4(errm , vl) >= 0) + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + template + simdutf_really_inline size_t convert(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. 
If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
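Note how the chunks are chained above: within each 64-byte block the first chunk is validated against a zero chunk and every later chunk against its predecessor, so check_utf8_bytes can look back across chunk boundaries at the bytes that precede a sequence. The pattern, abstracted over the chunk width (a sketch; check_one is a hypothetical stand-in for the real checker):

#include <cstddef>

// Validate a block chunk by chunk, handing each chunk its predecessor so
// multi-byte sequences straddling a chunk boundary are still seen whole.
// Assumes chunk_bytes <= 64.
static void check_block(const unsigned char *block, size_t chunk_bytes,
                        size_t num_chunks,
                        void (*check_one)(const unsigned char *chunk,
                                          const unsigned char *prev)) {
  static const unsigned char zeros[64] = {0};
  const unsigned char *prev = zeros; // first chunk is checked against zeros
  for (size_t i = 0; i < num_chunks; i++) {
    check_one(block + i * chunk_bytes, prev);
    prev = block + i * chunk_bytes;
  }
}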
+ } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); + if (howmany == 0) { return 0; + } + utf16_output += howmany; } + return utf16_output - start; + } - /* decoding */ + template + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. 
+ size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } - /* mask of non continuation bytes */ - vbool4_t m = __riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl); - vlOut = __riscv_vcpop_m_b4(m, vl); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - /* extract first and second bytes */ - vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl); - vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl); +}; // struct utf8_checker +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* begin file src/generic/utf8/utf16_length_from_utf8_bytemask.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf8 { - /* fast path: one and two byte */ - if (max < 0b11100000) { - b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); +using namespace simd; - vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); - b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); +simdutf_really_inline size_t utf16_length_from_utf8_bytemask(const char *in, + size_t size) { + using vector_i8 = simd8; + using vector_u8 = simd8; + using vector_u64 = simd64; - vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(b1, __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1<<6, m1, vlOut), vlOut); - b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); - if (is16) __riscv_vse16_v_u16m4((uint16_t*)dst, simdutf_byteflip(b12, vlOut), vlOut); - else __riscv_vse32_v_u32m8((uint32_t*)dst, __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut); - continue; - } + constexpr size_t N = vector_i8::SIZE; + constexpr size_t max_iterations = 255 / 2; - /* fast path: one, two and three byte */ - if (max < 0b11110000) { - vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); + auto counters = vector_u64::zero(); + auto local = vector_u8::zero(); - b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); - b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut); + size_t iterations = 0; + size_t pos = 0; + size_t count = 0; + for (; pos + N <= size; pos += N) { + const auto input = + vector_i8::load(reinterpret_cast(in + pos)); - vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); - vbool4_t m3 = 
__riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut); + const auto continuation = input > int8_t(-65); + const auto utf_4bytes = vector_u8(input.value) >= uint8_t(240); - vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); - b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut); + local -= vector_u8(continuation); + local -= vector_u8(utf_4bytes); - vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(b1, __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1<<6, m1, vlOut), vlOut); - b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); - vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu(m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut); - if (is16) __riscv_vse16_v_u16m4((uint16_t*)dst, simdutf_byteflip(b123, vlOut), vlOut); - else __riscv_vse32_v_u32m8((uint32_t*)dst, __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut); - continue; + iterations += 1; + if (iterations == max_iterations) { + counters += sum_8bytes(local); + local = vector_u8::zero(); + iterations = 0; } + } - /* extract third and fourth bytes */ - vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); - vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl); + if (iterations > 0) { + count += local.sum_bytes(); + } - #define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx) \ - vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx); \ - vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx); \ - vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx); \ - vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx); \ - /* remove prefix from trailing bytes */ \ - c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut); \ - c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut); \ - c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut); \ - /* remove prefix from leading bytes - * - * We could also use vrgather here, but it increases register pressure, - * and its performance varies widely on current platforms. It might be - * worth reconsidering, though, once there is more hardware available. - * Same goes for the __riscv_vsrl_vv_u32m4 correction step. 
- * - * We shift left and then right by the number of bytes in the prefix, - * which can be calculated as follows: - * x max(x-10, 0) - * 0xxx -> 0000-0111 -> sift by 0 or 1 -> 0 - * 10xx -> 1000-1011 -> don't care - * 110x -> 1100,1101 -> sift by 3 -> 2,3 - * 1110 -> 1110 -> sift by 4 -> 4 - * 1111 -> 1111 -> sift by 5 -> 5 - * - * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we - * just need to manually detect and handle the one special case: - */ \ - vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut); \ - shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), vlOut); \ - c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut); \ - c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut); \ - /* unconditionally widen and combine to c1234 */ \ - vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2(__riscv_vwmulu_vx_u16m2(c3, 1<<6, vlOut), c4, vlOut); \ - vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2(__riscv_vwmulu_vx_u16m2(c1, 1<<6, vlOut), c2, vlOut); \ - vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4(__riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut); \ - /* derive required right-shift amount from `shift` to reduce - * c1234 to the required number of bytes */ \ - c1234 = __riscv_vsrl_vv_u32m4(c1234, __riscv_vzext_vf4_u32m4(__riscv_vmul_vx_u8m1( \ - __riscv_vrsub_vx_u8m1( __riscv_vssubu_vx_u8m1(shift, 2, vlOut), 3, vlOut), 6, vlOut), vlOut), vlOut); \ - /* store result in desired format */ \ - if (is16) vlDst = rvv_utf32_store_utf16_m4((uint16_t*)dst, c1234, vlOut, m4even); \ - else vlDst = vlOut, __riscv_vse32_v_u32m4((uint32_t*)dst, c1234, vlOut); + count += counters.sum(); - /* Unrolling this manually reduces register pressure and allows - * us to terminate early. */ - { - size_t vlOutm2 = vlOut, vlDst; - vlOut = __riscv_vsetvl_e8m1(vlOut); - SIMDUTF_RVV_UTF8_TO_COMMON_M1(0) - if (vlOutm2 == vlOut) { - vlOut = vlDst; - continue; - } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} - dst += vlDst; - vlOut = vlOutm2 - vlOut; - } - { - size_t vlDst; - SIMDUTF_RVV_UTF8_TO_COMMON_M1(1) - vlOut = vlDst; - } +} // namespace utf8 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf8/utf16_length_from_utf8_bytemask.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + // transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf8_to_utf32 { -#undef SIMDUTF_RVV_UTF8_TO_COMMON_M1 - } +using namespace simd; - /* validate the last character and reparse it + tail */ - if (len > tail) { - if ((src[0] >> 6) == 0b10) - --dst; - while ((src[0] >> 6) == 0b10 && tail < len) - --src, ++tail; - if (is16) { - /* go back one more, when on high surrogate */ - if (simdutf_byteflip((uint16_t)dst[-1]) >= 0xD800 && simdutf_byteflip((uint16_t)dst[-1]) <= 0xDBFF) - --dst; +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char32_t *utf32_output) noexcept { + size_t pos = 0; + char32_t *start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! 
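Underneath the masked kernels, the per-code-point arithmetic is ordinary UTF-8 decoding, and on this valid-input path no range checks are needed. One code point in scalar form, as a reference for what each masked step computes (an illustrative helper, not part of simdutf's API):

#include <cstddef>
#include <cstdint>

// Decode one valid UTF-8 sequence at p into a code point and return the
// number of bytes consumed (1..4). Assumes well-formed input.
static size_t decode_one_valid(const unsigned char *p, char32_t *out) {
  if (p[0] < 0x80) { // 0xxxxxxx
    *out = p[0];
    return 1;
  }
  if (p[0] < 0xE0) { // 110xxxxx 10xxxxxx
    *out = char32_t((p[0] & 0x1F) << 6 | (p[1] & 0x3F));
    return 2;
  }
  if (p[0] < 0xF0) { // 1110xxxx 10xxxxxx 10xxxxxx
    *out = char32_t((p[0] & 0x0F) << 12 | (p[1] & 0x3F) << 6 | (p[2] & 0x3F));
    return 3;
  }
  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  *out = char32_t((p[0] & 0x07) << 18 | (p[1] & 0x3F) << 12 |
                  (p[2] & 0x3F) << 6 | (p[3] & 0x3F));
  return 4;
}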
+ while (pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + size_t max_starting_point = (pos + 64) - 12; + while (pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32( + input + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } } } - size_t ret = scalar(src, tail, dst); - if (ret == 0) return 0; - return (size_t)(dst - beg) + ret; + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, + utf32_output); + return utf32_output - start; } +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf8_to_utf32 { +using namespace simd; -simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char *src, size_t len, char *dst) const noexcept { - const char *beg = dst; - uint8_t last = 0b10000000; - for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut, last = src[-1]) { - vl = __riscv_vsetvl_e8m2(len); - vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t*)src, vl); - vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); - vlOut = __riscv_vcpop_m_b4(m, vl); - if (vlOut != vl || last > 0b01111111) { - vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have 
____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - vbool4_t leading0 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl); - vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl); - vbool4_t tobig = __riscv_vmand_mm_b4(leading0, __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl), 1, vl), vl); - if (__riscv_vfirst_m_b4(__riscv_vmor_mm_b4(tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl), vl) >= 0) - return 0; + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), v1, v1, 0b01000000, vl); - v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. 
However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
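The validator's error handling is deliberately deferred: every chunk check ORs its findings into the single error vector, and errors() asks one question after the loop, so the common all-valid path stays nearly branch-free. The same design in scalar miniature (the type and names are invented for illustration):

#include <cstdint>

// Sticky error accumulator: record() never branches; the one costly test
// happens after all blocks have been processed.
struct sticky_checker {
  uint64_t error = 0;
  void record(uint64_t suspect_bits) { error |= suspect_bits; }
  bool errors() const { return error != 0; }
};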
+ } + } + if (errors()) { + return 0; } - __riscv_vse8_v_u8m2((uint8_t*)dst, v1, vlOut); + if (pos < size) { + size_t howmany = + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if (howmany == 0) { + return 0; + } + utf32_output += howmany; + } + return utf32_output - start; } - if (last > 0b10111111) - return 0; - return dst - beg; -} - -simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char *src, size_t len, char *dst) const noexcept { - size_t res = convert_utf8_to_latin1(src, len, dst); - if (res) return result(error_code::SUCCESS, res); - return scalar::utf8_to_latin1::convert_with_errors(src, len, dst); -} -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char *src, size_t len, char *dst) const noexcept { - const char *beg = dst; - uint8_t last = 0b11000000; - for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut, last = src[-1]) { - vl = __riscv_vsetvl_e8m2(len); - vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t*)src, vl); - vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); - vlOut = __riscv_vcpop_m_b4(m, vl); - if (vlOut != vl || last > 0b01111111) { - vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); - v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), v1, v1, 0b01000000, vl); - v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. 
+ static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
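One bookkeeping detail recurs in the error paths handled just below: rewind_and_convert_with_errors is given the suffix in + pos, so any position it reports is relative to that suffix, and res.count += pos rebases it onto the whole input (on success, by contrast, count is a number of units written and is not rebased). Stated as a tiny helper (names invented; toy_result mimics simdutf's error/count pair):

#include <cstddef>

struct toy_result {
  int error;    // 0 on success
  size_t count; // error position, or number of units written
};

// Rebase a suffix-relative error position onto the whole buffer.
static toy_result rebase_error(toy_result suffix_res, size_t suffix_start) {
  suffix_res.count += suffix_start;
  return suffix_res;
}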
+ } + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if (pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } } - __riscv_vse8_v_u8m2((uint8_t*)dst, v1, vlOut); + return result(error_code::SUCCESS, utf32_output - start); } - return dst - beg; -} -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char *src, size_t len, char16_t *dst) const noexcept { - return rvv_utf8_to_common(src, len, (uint16_t*)dst); -} + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char *src, size_t len, char16_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_utf8_to_common(src, len, (uint16_t*)dst); - else - return rvv_utf8_to_common(src, len, (uint16_t*)dst); -} +}; // struct utf8_checker +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char *src, size_t len, char16_t *dst) const noexcept { - size_t res = convert_utf8_to_utf16le(src, len, dst); - if (res) return result(error_code::SUCCESS, res); - return scalar::utf8_to_utf16::convert_with_errors(src, len, dst); -} +#if SIMDUTF_FEATURE_UTF8 +/* begin file src/generic/utf8.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf8 { -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char *src, size_t len, char16_t *dst) const noexcept { - size_t res = convert_utf8_to_utf16be(src, len, dst); - if (res) return result(error_code::SUCCESS, res); - return scalar::utf8_to_utf16::convert_with_errors(src, len, dst); -} +using namespace simd; -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char *src, size_t len, char16_t *dst) const noexcept { - return rvv_utf8_to_common(src, len, (uint16_t*)dst); +simdutf_really_inline size_t count_code_points(const char *in, size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char *src, size_t len, char16_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_utf8_to_common(src, len, (uint16_t*)dst); - else - return rvv_utf8_to_common(src, len, (uint16_t*)dst); -} +#ifdef SIMDUTF_SIMD_HAS_BYTEMASK +simdutf_really_inline size_t count_code_points_bytemask(const char *in, + size_t size) { + using vector_i8 = simd8; + using vector_u8 = simd8; + using vector_u64 = simd64; -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char *src, size_t len, char32_t *dst) const noexcept { - return rvv_utf8_to_common(src, len, (uint32_t*)dst); -} + constexpr size_t N = vector_i8::SIZE; + constexpr size_t max_iterations 
= 255 / 4; -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char *src, size_t len, char32_t *dst) const noexcept { - size_t res = convert_utf8_to_utf32(src, len, dst); - if (res) return result(error_code::SUCCESS, res); - return scalar::utf8_to_utf32::convert_with_errors(src, len, dst); -} + size_t pos = 0; + size_t count = 0; -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char *src, size_t len, char32_t *dst) const noexcept { - return rvv_utf8_to_common(src, len, (uint32_t*)dst); -} + auto counters = vector_u64::zero(); + auto local = vector_u8::zero(); + size_t iterations = 0; + for (; pos + 4 * N <= size; pos += 4 * N) { + const auto input0 = + simd8::load(reinterpret_cast(in + pos + 0 * N)); + const auto input1 = + simd8::load(reinterpret_cast(in + pos + 1 * N)); + const auto input2 = + simd8::load(reinterpret_cast(in + pos + 2 * N)); + const auto input3 = + simd8::load(reinterpret_cast(in + pos + 3 * N)); + const auto mask0 = input0 > int8_t(-65); + const auto mask1 = input1 > int8_t(-65); + const auto mask2 = input2 > int8_t(-65); + const auto mask3 = input3 > int8_t(-65); -/* end file src/rvv/rvv_utf8_to.inl.cpp */ -/* begin file src/rvv/rvv_utf16_to.inl.cpp */ -#include + local -= vector_u8(mask0); + local -= vector_u8(mask1); + local -= vector_u8(mask2); + local -= vector_u8(mask3); -template -simdutf_really_inline static result rvv_utf16_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) { - const char16_t *const beg = src; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t*)src, vl); - v = simdutf_byteflip(v, vl); - long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 255, vl), vl); - if (idx >= 0) - return result(error_code::TOO_LARGE, beg - src + idx); - __riscv_vse8_v_u8m4((uint8_t*)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); + iterations += 1; + if (iterations == max_iterations) { + counters += sum_8bytes(local); + local = vector_u8::zero(); + iterations = 0; + } } - return result(error_code::SUCCESS, src - beg); -} - -simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf16le_to_latin1_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; -} -simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf16be_to_latin1_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? 
res.count : 0; -} - -simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) const noexcept { - return rvv_utf16_to_latin1_with_errors(src, len, dst); -} + if (iterations > 0) { + count += local.sum_bytes(); + } -simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) const noexcept { - if (supports_zvbb()) - return rvv_utf16_to_latin1_with_errors(src, len, dst); - else - return rvv_utf16_to_latin1_with_errors(src, len, dst); -} + count += counters.sum(); -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t *src, size_t len, char *dst) const noexcept { - const char16_t *const beg = src; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t*)src, vl); - __riscv_vse8_v_u8m4((uint8_t*)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); - } - return src - beg; + return count + scalar::utf8::count_code_points(in + pos, size - pos); } +#endif // SIMDUTF_SIMD_HAS_BYTEMASK -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t *src, size_t len, char *dst) const noexcept { - const char16_t *const beg = src; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t*)src, vl); - __riscv_vse8_v_u8m4((uint8_t*)dst, __riscv_vnsrl_wx_u8m4(v, 8, vl), vl); +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). 
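The statements that follow implement a tidy counting identity: each byte that is not a continuation byte starts a code point, hence one UTF-16 code unit, and each 4-byte lead (0xF0 and up) contributes a second unit because those code points become surrogate pairs. The same rule in scalar form (helper name invented; assumes the input is valid UTF-8):

#include <cstddef>
#include <cstdint>

// UTF-16 code units needed to transcode n bytes of valid UTF-8.
static size_t utf16_units_from_utf8(const unsigned char *p, size_t n) {
  size_t count = 0;
  for (size_t i = 0; i < n; i++) {
    count += (int8_t(p[i]) > -65); // lead or ASCII byte: one unit
    count += (p[i] >= 0xF0);       // 4-byte lead: one more (surrogate pair)
  }
  return count;
}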
+ count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); } - return src - beg; + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); } -template -simdutf_really_inline static result rvv_utf16_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) { - size_t n = len; - const char16_t *srcBeg = src; - const char *dstBeg = dst; - size_t vl8m4 = __riscv_vsetvlmax_e8m4(); - vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(__riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4); - - for (size_t vl, vlOut; n > 0; ) { - vl = __riscv_vsetvl_e16m2(n); +} // namespace utf8 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf8.h */ +#endif // SIMDUTF_FEATURE_UTF8 - vuint16m2_t v = __riscv_vle16_v_u16m2((uint16_t const*)src, vl); - v = simdutf_byteflip(v, vl); - vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80-1, vl); +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/generic/utf16/count_code_points_bytemask.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf16 { - if (__riscv_vfirst_m_b8(m234,vl) < 0) { /* 1 byte utf8 */ - vlOut = vl; - __riscv_vse8_v_u8m1((uint8_t*)dst, __riscv_vncvt_x_x_w_u8m1(v, vlOut), vlOut); - n -= vl, src += vl, dst += vlOut; - continue; - } +using namespace simd; - vbool8_t m34 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800-1, vl); +template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; - if (__riscv_vfirst_m_b8(m34,vl) < 0) { /* 1/2 byte utf8 */ - /* 0: [ aaa|aabbbbbb] - * 1: [aabbbbbb| ] vsll 8 - * 2: [ | aaaaa] vsrl 6 - * 3: [00111111|00011111] - * 4: [ bbbbbb|000aaaaa] (1|2)&3 - * 5: [11000000|11000000] - * 6: [10bbbbbb|110aaaaa] 4|5 */ - vuint16m2_t twoByte = - __riscv_vand_vx_u16m2(__riscv_vor_vv_u16m2( - __riscv_vsll_vx_u16m2(v, 8, vl), - __riscv_vsrl_vx_u16m2(v, 6, vl), - vl), 0b0011111100011111, vl); - vuint16m2_t vout16 = __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl); - vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16); + size_t pos = 0; + size_t count = 0; - /* Every high byte that is zero should be compressed - * low bytes should never be compressed, so we set them - * to all ones, and then create a non-zero bytes mask */ - vbool4_t mcomp = __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(__riscv_vor_vx_u16m2(vout16, 0xFF, vl)), 0, vl*2); - vlOut = __riscv_vcpop_m_b4(mcomp, vl*2); + constexpr size_t max_iterations = 65535; + const auto one = vector_u16::splat(1); + const auto zero = vector_u16::zero(); - vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl*2); - __riscv_vse8_v_u8m2((uint8_t*)dst, vout, vlOut); + size_t iteration = 0; - n -= vl, src += vl, dst += vlOut; - continue; + auto counters = zero; + for (; pos < size / N * N; pos += N) { + auto input = vector_u16::load(in + pos); + if simdutf_constexpr (!match_system(big_endian)) { + input = input.swap_bytes(); } - vbool8_t sur = __riscv_vmseq_vx_u16m2_b8(__riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl); - long first = __riscv_vfirst_m_b8(sur, vl); - size_t tail = vl - first; - vl = first < 0 ? 
vl : first; - - if (vl > 0) { /* 1/2/3 byte utf8 */ - /* in: [aaaabbbb|bbcccccc] - * v1: [0bcccccc| ] vsll 8 - * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000 - * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000 - * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000 - * v3: [ |1110aaaa] vsrl 12 | 0b11100000 - * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc] - * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc] - * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] [10cccccc] - */ - vuint16m2_t v1, v2, v3, v12; - v1 = __riscv_vor_vx_u16m2_mu(m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl); - v1 = __riscv_vsll_vx_u16m2(v1, 8, vl); + const auto t0 = input & uint16_t(0xfc00); + const auto t1 = t0 ^ uint16_t(0xdc00); - v2 = __riscv_vor_vx_u16m2(__riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111, vl), 0b10000000, vl); - v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34,vl), v2, v2, 0b01000000, vl); - v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000, vl); - v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl); - - vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1<<8, vl); - vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); - vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123); + // t2[0] == 1 iff input[0] outside range 0xdc00..dfff (the word is not a + // high surrogate) + const auto t2 = min(t1, one); - vbool2_t mcomp = __riscv_vmor_mm_b2(m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl*4), vl*4); - vlOut = __riscv_vcpop_m_b2(mcomp, vl*4); + counters += t2; - vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl*4); - __riscv_vse8_v_u8m4((uint8_t*)dst, vout, vlOut); - - n -= vl, src += vl, dst += vlOut; + iteration += 1; + if (iteration == max_iterations) { + count += counters.sum(); + counters = zero; + iteration = 0; } + } - if (tail) while (n) { - uint16_t word = simdutf_byteflip(src[0]); - if((word & 0xFF80)==0) { - break; - } else if((word & 0xF800)==0) { - break; - } else if ((word & 0xF800) != 0xD800) { - break; - } else { - // must be a surrogate pair - if (n <= 1) return result(error_code::SURROGATE, src - srcBeg); - uint16_t diff = word - 0xD800; - if (diff > 0x3FF) return result(error_code::SURROGATE, src - srcBeg); - uint16_t diff2 = simdutf_byteflip(src[1]) - 0xDC00; - if (diff2 > 0x3FF) return result(error_code::SURROGATE, src - srcBeg); - - uint32_t value = ((diff + 0x40) << 10) + diff2 ; - - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - *dst++ = (char)( (value>>18) | 0b11110000); - *dst++ = (char)(((value>>12) & 0b111111) | 0b10000000); - *dst++ = (char)(((value>> 6) & 0b111111) | 0b10000000); - *dst++ = (char)(( value & 0b111111) | 0b10000000); - src += 2; - n -= 2; - } - } + if (iteration > 0) { + count += counters.sum(); } - return result(error_code::SUCCESS, dst - dstBeg); + return count + + scalar::utf16::count_code_points(in + pos, size - pos); } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf16le_to_utf8_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? 
res.count : 0; -} +} // namespace utf16 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf16/count_code_points_bytemask.h */ +/* begin file src/generic/utf16/change_endianness.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf16 { -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf16be_to_utf8_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; -} +simdutf_really_inline void +change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { + size_t pos = 0; -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) const noexcept { - return rvv_utf16_to_utf8_with_errors(src, len, dst); -} + while (pos < size / 32 * 32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) const noexcept { - if (supports_zvbb()) - return rvv_utf16_to_utf8_with_errors(src, len, dst); - else - return rvv_utf16_to_utf8_with_errors(src, len, dst); + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t *src, size_t len, char *dst) const noexcept { - return convert_utf16le_to_utf8(src, len, dst); -} +} // namespace utf16 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf16/change_endianness.h */ +/* begin file src/generic/utf16/utf8_length_from_utf16_bytemask.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf16 { + +using namespace simd; + +template +simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, + size_t size) { + size_t pos = 0; -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t *src, size_t len, char *dst) const noexcept { - return convert_utf16be_to_utf8(src, len, dst); -} + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; -template -simdutf_really_inline static result rvv_utf16_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) { - const char16_t *const srcBeg = src; - char32_t *const dstBeg = dst; + const auto one = vector_u16::splat(1); - constexpr const uint16_t ANY_SURROGATE_MASK = 0xf800; - constexpr const uint16_t ANY_SURROGATE_VALUE = 0xd800; - constexpr const uint16_t LO_SURROGATE_MASK = 0xfc00; - constexpr const uint16_t LO_SURROGATE_VALUE = 0xdc00; - constexpr const uint16_t HI_SURROGATE_MASK = 0xfc00; - constexpr const uint16_t HI_SURROGATE_VALUE = 0xd800; + auto v_count = vector_u16::zero(); - uint16_t last = 0; - while (len > 0) { - size_t vl = __riscv_vsetvl_e16m2(len); - vuint16m2_t v0 = __riscv_vle16_v_u16m2((uint16_t const*)src, vl); - v0 = simdutf_byteflip(v0, vl); + // each char16 yields at least one byte + size_t count = size / N * N; - { // check fast-path - const vuint16m2_t v = __riscv_vand_vx_u16m2(v0, ANY_SURROGATE_MASK, vl); - const vbool8_t any_surrogate = __riscv_vmseq_vx_u16m2_b8(v, ANY_SURROGATE_VALUE, vl); - if (__riscv_vfirst_m_b8(any_surrogate, vl) < 0) { - /* no surrogates */ - __riscv_vse32_v_u32m4((uint32_t*)dst, __riscv_vzext_vf2_u32m4(v0, vl), vl); - len -= vl; - src += vl; - dst += vl; - continue; 
- } - } + // in a single iteration the increment is 0, 1 or 2, despite we have + // three additions + constexpr size_t max_iterations = 65535 / 2; + size_t iteration = max_iterations; - if ((simdutf_byteflip(src[0]) & LO_SURROGATE_MASK) == LO_SURROGATE_VALUE) { - return result(error_code::SURROGATE, src - srcBeg); + for (; pos < size / N * N; pos += N) { + auto input = vector_u16::load(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input = input.swap_bytes(); } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 .. 0xdfff - high surrogate + const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); - // decode surrogates - vuint16m2_t v1 = __riscv_vslide1down_vx_u16m2(v0, 0, vl); - vl = __riscv_vsetvl_e16m2(vl - 1); - if (vl == 0) { - return result(error_code::SURROGATE, src - srcBeg); - } + // c0 - chars that yield 2- or 3-byte UTF-8 codes + const auto c0 = min(input & uint16_t(0xff80), one); - const vbool8_t surhi = __riscv_vmseq_vx_u16m2_b8(__riscv_vand_vx_u16m2(v0, HI_SURROGATE_MASK, vl), HI_SURROGATE_VALUE, vl); - const vbool8_t surlo = __riscv_vmseq_vx_u16m2_b8(__riscv_vand_vx_u16m2(v1, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE, vl); + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + const auto c1 = min(input & uint16_t(0xf800), one); - // compress everything but lo surrogates - const vbool8_t compress = __riscv_vmsne_vx_u16m2_b8(__riscv_vand_vx_u16m2(v0, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE, vl); + /* + Explanation how the counting works. + + In the case of a non-surrogate character we count: + * always 1 -- see how `count` is initialized above; + * c0 = 1 if the current char yields 2 or 3 bytes; + * c1 = 1 if the current char yields 3 bytes. + + Thus, we always have correct count for the current char: + from 1, 2 or 3 bytes. + + A trickier part is how we count surrogate pairs. Whether + we encounter a surrogate (low or high), we count it as + 3 chars and then minus 1 (`is_surrogate` is -1 or 0). + Each surrogate char yields 2. A surrogate pair, that + is a low surrogate followed by a high one, yields + the expected 4 bytes. + + It also correctly handles cases when low surrogate is + processed by the this loop, but high surrogate is counted + by the scalar procedure. The scalar procedure uses exactly + the described approach, thanks to that for valid UTF-16 + strings it always count correctly. 
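+
+ A worked example: U+1D11E is stored as the surrogate pair
+ 0xD834 0xDD1E. Each half contributes 1 (base) + 1 (c0) + 1 (c1)
+ - 1 (is_surrogate) = 2 bytes, so the pair contributes 4 bytes in
+ total, matching its four-byte UTF-8 encoding 0xF0 0x9D 0x84 0x9E.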
+ */ + v_count += c0; + v_count += c1; + v_count += vector_u16(is_surrogate); - { - const vbool8_t diff = __riscv_vmxor_mm_b8(surhi, surlo, vl); - const long idx = __riscv_vfirst_m_b8(diff, vl); - if (idx >= 0) { - return result(error_code::SURROGATE, src - srcBeg + idx + 1); - } + iteration -= 1; + if (iteration == 0) { + count += v_count.sum(); + v_count = vector_u16::zero(); + iteration = max_iterations; } + } - last = simdutf_byteflip(src[vl]); - vuint32m4_t utf32 = __riscv_vzext_vf2_u32m4(v0, vl); + if (iteration > 0) { + count += v_count.sum(); + } - // v0 = 110110yyyyyyyyyy (0xd800 + yyyyyyyyyy) --- hi surrogate - // v1 = 110111xxxxxxxxxx (0xdc00 + xxxxxxxxxx) --- lo surrogate + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} - // t0 = u16( 0000_00yy_yyyy_yyyy) - const vuint32m4_t t0 = __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v0, 0x03ff, vl), vl); - // t1 = u32(0000_0000_0000_yyyy_yyyy_yy00_0000_0000) - const vuint32m4_t t1 = __riscv_vsll_vx_u32m4(t0, 10, vl); +template +simdutf_really_inline result +utf8_length_from_utf16_with_replacement(const char16_t *in, size_t size) { + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; + if (N + 1 > size) { + return scalar::utf16::utf8_length_from_utf16_with_replacement( + in, size); + } // special case for short inputs + size_t pos = 0; + bool any_surrogates = false; - // t2 = u32(0000_0000_0000_0000_0000_00xx_xxxx_xxxx) - const vuint32m4_t t2 = __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v1, 0x03ff, vl), vl); + const auto one = vector_u16::splat(1); - // t3 = u32(0000_0000_0000_yyyy_yyyy_yyxx_xxxx_xxxx) - const vuint32m4_t t3 = __riscv_vor_vv_u32m4(t1, t2, vl); + auto v_count = vector_u16::zero(); + auto v_mismatched_count = vector_u16::zero(); - // t4 = utf32 from surrogate pairs - const vuint32m4_t t4 = __riscv_vadd_vx_u32m4(t3, 0x10000, vl); + size_t count = 0; + size_t mismatched_count = 0; + + // in a single iteration the increment is 0, 1 or 2, despite we have + // three additions + constexpr size_t max_iterations = 65535 / 2; + size_t iteration = max_iterations; + + if (scalar::utf16::is_low_surrogate(in[0])) { + any_surrogates = true; + mismatched_count += 1; + } + + for (; pos < (size - 1) / N * N; pos += N) { + auto input = vector_u16::load(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input = input.swap_bytes(); + } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 .. 
0xdfff - high surrogate + const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); + + // c0 - chars that yield 2- or 3-byte UTF-8 codes + const auto c0 = min(input & uint16_t(0xff80), one); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + const auto c1 = min(input & uint16_t(0xf800), one); + + v_count += c0; + v_count += c1; + v_count += vector_u16(is_surrogate); + if (is_surrogate.to_bitmask() != 0 || + scalar::utf16::is_low_surrogate(in[pos + N])) { + any_surrogates = true; + auto input_next = + vector_u16::load(reinterpret_cast(in + pos + 1)); + if simdutf_constexpr (!match_system(big_endian)) { + input_next = input_next.swap_bytes(); + } - const vuint32m4_t result = __riscv_vmerge_vvm_u32m4(utf32, t4, surhi, vl); + const auto lb_masked = input & (0xfc00); + const auto block_masked = input_next & (0xfc00); - const vuint32m4_t comp = __riscv_vcompress_vm_u32m4(result, compress, vl); - const size_t vlOut = __riscv_vcpop_m_b8(compress, vl); - __riscv_vse32_v_u32m4((uint32_t*)dst, comp, vlOut); + const auto lb_is_high = lb_masked == (0xd800); + const auto block_is_low = block_masked == (0xdc00); - len -= vl; - src += vl; - dst += vlOut; + const auto illseq = min(vector_u16(lb_is_high ^ block_is_low), one); - if ((last & LO_SURROGATE_MASK) == LO_SURROGATE_VALUE) { - // last item is lo surrogate and got already consumed - len -= 1; - src += 1; + v_mismatched_count += illseq; + } + + iteration -= 1; + if (iteration == 0) { + count += v_count.sum(); + v_count = vector_u16::zero(); + mismatched_count += v_mismatched_count.sum(); + v_mismatched_count = vector_u16::zero(); + iteration = max_iterations; } } - return result(error_code::SUCCESS, dst - dstBeg); -} + if (iteration > 0) { + count += v_count.sum(); + mismatched_count += v_mismatched_count.sum(); + } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t *src, size_t len, char32_t *dst) const noexcept { - result res = convert_utf16le_to_utf32_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; + if (scalar::utf16::is_low_surrogate(in[pos])) { + any_surrogates = true; + if (!scalar::utf16::is_high_surrogate(in[pos - 1])) { + mismatched_count -= 1; + count += 2; + pos += 1; + } + } + count += pos; + count += mismatched_count; + if (scalar::utf16::is_high_surrogate(in[pos - 1])) { + any_surrogates = true; + if (pos == size) { + count += 2; + } else if (scalar::utf16::is_low_surrogate(in[pos])) { + pos += 1; + count += 2; + } + } + result scalar_result = + scalar::utf16::utf8_length_from_utf16_with_replacement( + in + pos, size - pos); + return {any_surrogates ? SURROGATE : scalar_result.error, + count + scalar_result.count}; } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t *src, size_t len, char32_t *dst) const noexcept { - result res = convert_utf16be_to_utf32_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? 
res.count : 0; -} +} // namespace utf16 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf16/utf8_length_from_utf16_bytemask.h */ +/* begin file src/generic/utf16/utf32_length_from_utf16.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf16 { -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) const noexcept { - return rvv_utf16_to_utf32_with_errors(src, len, dst); +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, + size_t size) { + return count_code_points(in, size); } -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_utf16_to_utf32_with_errors(src, len, dst); - else - return rvv_utf16_to_utf32_with_errors(src, len, dst); -} +} // namespace utf16 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf16/utf32_length_from_utf16.h */ +/* begin file src/generic/utf16/to_well_formed.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf16 { -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t *src, size_t len, char32_t *dst) const noexcept { - return convert_utf16le_to_utf32(src, len, dst); -} +// Note: this is direct translation of westmere implementation. -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t *src, size_t len, char32_t *dst) const noexcept { - return convert_utf16be_to_utf32(src, len, dst); -} -/* end file src/rvv/rvv_utf16_to.inl.cpp */ -/* begin file src/rvv/rvv_utf32_to.inl.cpp */ +/* + * Process one block of 8 characters. If in_place is false, + * copy the block from in to out. If there is a sequencing + * error in the block, overwrite the illsequenced characters + * with the replacement character. This function reads one + * character before the beginning of the buffer as a lookback. + * If that character is illsequenced, it too is overwritten. + */ +template +simdutf_really_inline void utf16fix_block(char16_t *out, const char16_t *in) { + const char16_t replacement = scalar::utf16::replacement(); -simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf32_to_latin1_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; -} + using vector_u16 = simd16; + auto swap_if_needed = [](uint16_t x) simdutf_constexpr -> uint16_t { + return scalar::utf16::swap_if_needed(x); + }; -simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t *src, size_t len, char *dst) const noexcept { - const char32_t *const beg = src; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e32m8(len); - vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t*)src, vl); - long idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 255, vl), vl); - if (idx >= 0) - return result(error_code::TOO_LARGE, src - beg + idx); - /* We don't use vcompress here, because its performance varies widely on current platforms. - * This might be worth reconsidering once there is more hardware available. 
*/ - __riscv_vse8_v_u8m2((uint8_t*)dst, __riscv_vncvt_x_x_w_u8m2(__riscv_vncvt_x_x_w_u16m4(v, vl), vl), vl); + const auto lookback = vector_u16::load(in - 1); + const auto block = vector_u16::load(in); + + const auto lb_masked = lookback & swap_if_needed(0xfc00); + const auto block_masked = block & swap_if_needed(0xfc00); + + const auto lb_is_high = lb_masked == swap_if_needed(0xd800); + const auto block_is_low = block_masked == swap_if_needed(0xdc00); + const auto illseq = lb_is_high ^ block_is_low; + if (!illseq.is_zero()) { + /* compute the cause of the illegal sequencing */ + const auto lb_illseq = ~block_is_low & lb_is_high; + const auto block_illseq = + (~lb_is_high & block_is_low) | lb_illseq.template byte_right_shift<2>(); + + /* fix illegal sequencing in the lookback */ + const auto lb = lb_illseq.first(); + out[-1] = char16_t((lb & replacement) | (~lb & out[-1])); + /* fix illegal sequencing in the main block */ + const auto mask = as_vector_u16(block_illseq); + const auto fixed = (~mask & block) | (mask & replacement); + + fixed.store(reinterpret_cast(out)); + } else if (!in_place) { + block.store(reinterpret_cast(out)); } - return result(error_code::SUCCESS, src - beg); } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t *src, size_t len, char *dst) const noexcept { - return convert_utf32_to_latin1(src, len, dst); -} +template +void to_well_formed(const char16_t *in, size_t n, char16_t *out) { + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t *src, size_t len, char *dst) const noexcept { - size_t n = len; - const char32_t *srcBeg = src; - const char *dstBeg = dst; - size_t vl8m4 = __riscv_vsetvlmax_e8m4(); - vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(__riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4); + if (n < N + 1) { + scalar::utf16::to_well_formed_utf16(in, n, out); + return; + } - for (size_t vl, vlOut; n > 0; ) { - vl = __riscv_vsetvl_e32m4(n); + const char16_t replacement = scalar::utf16::replacement(); - vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t const*)src, vl); - vbool8_t m234 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x80-1, vl); - vuint16m2_t vn = __riscv_vncvt_x_x_w_u16m2(v, vl); + out[0] = + scalar::utf16::is_low_surrogate(in[0]) ? 
replacement : in[0];
- if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
- vlOut = vl;
- __riscv_vse8_v_u8m1((uint8_t*)dst, __riscv_vncvt_x_x_w_u8m1(vn, vlOut), vlOut);
- n -= vl, src += vl, dst += vlOut;
- continue;
+ /* duplicate code to have the compiler specialise utf16fix_block() */
+ if (in == out) {
+ constexpr bool inplace = true;
+ for (size_t i = 1; i + N < n; i += N) {
+ utf16fix_block(out + i, in + i);
+ }
- vbool8_t m34 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x800-1, vl);
+ utf16fix_block(out + n - N, in + n - N);
+ } else {
+ constexpr bool copy_data = false;
+ for (size_t i = 1; i + N < n; i += N) {
+ utf16fix_block(out + i, in + i);
+ }
- if (__riscv_vfirst_m_b8(m34,vl) < 0) { /* 1/2 byte utf8 */
- /* 0: [ aaa|aabbbbbb]
- * 1: [aabbbbbb| ] vsll 8
- * 2: [ | aaaaa] vsrl 6
- * 3: [00111111|00111111]
- * 4: [ bbbbbb|000aaaaa] (1|2)&3
- * 5: [10000000|11000000]
- * 6: [10bbbbbb|110aaaaa] 4|5 */
- vuint16m2_t twoByte =
- __riscv_vand_vx_u16m2(__riscv_vor_vv_u16m2(
- __riscv_vsll_vx_u16m2(vn, 8, vl),
- __riscv_vsrl_vx_u16m2(vn, 6, vl),
- vl), 0b0011111100111111, vl);
- vuint16m2_t vout16 = __riscv_vor_vx_u16m2_mu(m234, vn, twoByte, 0b1000000011000000, vl);
- vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);
+ utf16fix_block(out + n - N, in + n - N);
+ }
- /* Every high byte that is zero should be compressed
- * low bytes should never be compressed, so we set them
- * to all ones, and then create a non-zero bytes mask */
- vbool4_t mcomp = __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(__riscv_vor_vx_u16m2(vout16, 0xFF, vl)), 0, vl*2);
- vlOut = __riscv_vcpop_m_b4(mcomp, vl*2);
+ out[n - 1] = scalar::utf16::is_high_surrogate(out[n - 1])
+ ? replacement
+ : out[n - 1];
+}
- vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl*2);
- __riscv_vse8_v_u8m2((uint8_t*)dst, vout, vlOut);
+} // namespace utf16
+} // unnamed namespace
+} // namespace lasx
+} // namespace simdutf
+/* end file src/generic/utf16/to_well_formed.h */
+#endif // SIMDUTF_FEATURE_UTF16
- n -= vl, src += vl, dst += vlOut;
- continue;
- }
+#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
+/* begin file src/generic/validate_utf16.h */
+namespace simdutf {
+namespace lasx {
+namespace {
+namespace utf16 {
+/*
+ UTF-16 validation
+ --------------------------------------------------
- vbool8_t sur = __riscv_vmseq_vx_u32m4_b8(__riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl);
- long idx = __riscv_vfirst_m_b8(sur, vl);
- if (idx >= 0) return result(error_code::SURROGATE, src - srcBeg + idx);
+ In UTF-16, code units in the range 0xD800 to 0xDFFF have special meaning.
- vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000-1, vl);
- long first = __riscv_vfirst_m_b8(m4, vl);
- size_t tail = vl - first;
- vl = first < 0 ? vl : first;
+ In a vectorized algorithm we want to examine the most significant
+ nibble in order to select a fast path. If none of the highest nibbles
+ is 0xD (13), then we are sure that the UTF-16 chunk in a vector
+ register is valid.
- if (vl > 0) { /* 1/2/3 byte utf8 */ - /* vn: [aaaabbbb|bbcccccc] - * v1: [0bcccccc| ] vsll 8 - * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000 - * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000 - * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000 - * v3: [ |1110aaaa] vsrl 12 | 0b11100000 - * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc] - * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc] - * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] [10cccccc] - */ - vuint16m2_t v1, v2, v3, v12; - v1 = __riscv_vor_vx_u16m2_mu(m234, vn, __riscv_vand_vx_u16m2(vn, 0b00111111, vl), 0b10000000, vl); - v1 = __riscv_vsll_vx_u16m2(v1, 8, vl); + Let us analyze what we need to check if the nibble is 0xD. The + value of the preceding nibble determines what we have: - v2 = __riscv_vor_vx_u16m2(__riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 6, vl), 0b00111111, vl), 0b10000000, vl); - v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34,vl), v2, v2, 0b01000000, vl); - v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 12, vl), 0b11100000, vl); - v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl); + 0xd000 .. 0xd7ff - a valid word + 0xd800 .. 0xdbff - low surrogate + 0xdc00 .. 0xdfff - high surrogate - vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1<<8, vl); - vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); - vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123); + Other constraints we have to consider: + - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) + - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) + - there must not be sole low surrogate nor high surrogate - vbool2_t mcomp = __riscv_vmor_mm_b2(m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl*4), vl*4); - vlOut = __riscv_vcpop_m_b2(mcomp, vl*4); + We are going to build three bitmasks based on the 3rd nibble: + - V = valid word, + - L = low surrogate (0xd800 .. 0xdbff) + - H = high surrogate (0xdc00 .. 0xdfff) - vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl*4); - __riscv_vse8_v_u8m4((uint8_t*)dst, vout, vlOut); + 0 1 2 3 4 5 6 7 <--- word index + [ V | L | H | L | H | V | V | L ] + 1 0 0 0 0 1 1 0 - V = valid masks + 0 1 0 1 0 0 0 1 - L = low surrogate + 0 0 1 0 1 0 0 0 - H high surrogate - n -= vl, src += vl, dst += vlOut; - } - if (tail) while (n) { - uint32_t word = src[0]; - if (word < 0x10000) break; - if (word > 0x10FFFF) return result(error_code::TOO_LARGE, src - srcBeg); - *dst++ = (uint8_t)(( word>>18) | 0b11110000); - *dst++ = (uint8_t)(((word>>12) & 0b111111) | 0b10000000); - *dst++ = (uint8_t)(((word>> 6) & 0b111111) | 0b10000000); - *dst++ = (uint8_t)(( word & 0b111111) | 0b10000000); - ++src; - --n; + 1 0 0 0 0 1 1 0 V = valid masks + 0 1 0 1 0 0 0 0 a = L & (H >> 1) + 0 0 1 0 1 0 0 0 b = a << 1 + 1 1 1 1 1 1 1 0 c = V | a | b + ^ + the last bit can be zero, we just consume 7 + code units and recheck this word in the next iteration +*/ +template +const result validate_utf16_with_errors(const char16_t *input, size_t size) { + if (simdutf_unlikely(size == 0)) { + return result(error_code::SUCCESS, 0); + } + + const char16_t *start = input; + const char16_t *end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::SIZE * 2 < end) { + // 0. 
Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors into one which
+ // consists of only the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 =
+ simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+
+ // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16
+ // and yields a single vector having only the higher bytes of characters.
+ const auto in = utf16_gather_high_bytes(in0, in1);
+
+ // 1. Check whether we have any 0xD800..0xDFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint16_t surrogates_bitmask =
+ static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+ if (surrogates_bitmask == 0x0000) {
+ input += 16;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ //
+ // Fact: a high surrogate has the 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+
+ const uint16_t a = static_cast<uint16_t>(
+ L & (H >> 1)); // A low surrogate must be followed by a high one.
+ // (A low surrogate placed in the last word of the
+ // register is an exception we handle.)
+ const uint16_t b = static_cast<uint16_t>(
+ a << 1); // Just mark that the opposite fact holds; thanks to
+ // that we have only two masks for the valid case.
+ const uint16_t c = static_cast<uint16_t>(
+ V | a | b); // Combine all the masks into the final one.
+
+ if (c == 0xffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += 16;
+ } else if (c == 0x7fff) {
+ // The 15 lower code units of the input register contain valid UTF-16.
+ // The 15th word may be either a low or a high surrogate. In the next
+ // iteration we 1) check if the low surrogate is followed by a high
+ // one, 2) reject a sole high surrogate.
+ input += 15;
+ } else {
+ return result(error_code::SURROGATE, input - start);
+ }
+ }
+ }
+
+ return result(error_code::SUCCESS, input - start);
+}
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t *src, size_t len, char *dst) const noexcept {
- result res = convert_utf32_to_utf8_with_errors(src, len, dst);
- return res.error == error_code::SUCCESS ?
res.count : 0; -} +template +const result validate_utf16_as_ascii_with_errors(const char16_t *input, + size_t size) { + if (simdutf_unlikely(size == 0)) { + return result(error_code::SUCCESS, 0); + } + size_t pos = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input_vec( + reinterpret_cast(input + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input_vec.swap_bytes(); + } + uint64_t matches = input_vec.lteq(uint16_t(0x7f)); + if (~matches) { + // Found a match, return the first one + int index = trailing_zeroes(~matches) / 2; + return result(error_code::TOO_LARGE, pos + index); + } + } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t *src, size_t len, char *dst) const noexcept { - return convert_utf32_to_utf8(src, len, dst); -} + // Scalar tail + while (pos < size) { -template -simdutf_really_inline static result rvv_convert_utf32_to_utf16_with_errors(const char32_t *src, size_t len, char16_t *dst) { - size_t vl8m2 = __riscv_vsetvlmax_e8m2(); - vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(__riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); - const char16_t *dstBeg = dst; - const char32_t *srcBeg = src; - for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { - vl = __riscv_vsetvl_e32m4(len); - vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t*)src, vl); - vuint32m4_t off = __riscv_vadd_vx_u32m4(v, 0xFFFF2000, vl); - long idx; - idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl); - if (idx >= 0) return result(error_code::SURROGATE, src - srcBeg + idx); - idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl); - if (idx < 0) { - vlOut = vl; - vuint16m2_t n = simdutf_byteflip(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); - __riscv_vse16_v_u16m2((uint16_t*)dst, n, vlOut); - continue; + char16_t v = scalar::utf16::swap_if_needed(input[pos]); + if (v > 0x7F) { + return result(error_code::TOO_LARGE, pos); } - idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); - if (idx >= 0) return result(error_code::TOO_LARGE, src - srcBeg + idx); - vlOut = rvv_utf32_store_utf16_m4((uint16_t*)dst, v, vl, m4even); + pos++; } - return result(error_code::SUCCESS, dst - dstBeg); + return result(error_code::SUCCESS, size); } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t *src, size_t len, char16_t *dst) const noexcept { - result res = convert_utf32_to_utf16le_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; -} +} // namespace utf16 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/validate_utf16.h */ +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t *src, size_t len, char16_t *dst) const noexcept { - result res = convert_utf32_to_utf16be_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; -} +#if SIMDUTF_FEATURE_UTF32 +/* begin file src/generic/utf32.h */ +#include -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t *src, size_t len, char16_t *dst) const noexcept { - return rvv_convert_utf32_to_utf16_with_errors(src, len, dst); -} +namespace simdutf { +namespace lasx { +namespace { +namespace utf32 { + +template T min(T a, T b) { return a <= b ? 
a : b; } + +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { + using vector_u32 = simd32; + + const char32_t *start = input; + + // we add up to three ones in a single iteration (see the vectorized loop in + // section #2 below) + const size_t max_increment = 3; + + const size_t N = vector_u32::ELEMENTS; + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else + const auto v_ffffff80 = vector_u32::splat(0xffffff80); + const auto v_fffff800 = vector_u32::splat(0xfffff800); + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + size_t counter = 0; + + // 1. vectorized loop unrolled 4 times + { + // we use vector of uint32 counters, this is why this limit is used + const size_t max_iterations = + std::numeric_limits::max() / (max_increment * 4); + size_t blocks = length / (N * 4); + length -= blocks * (N * 4); + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + simd32 acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in0 = vector_u32(input + 0 * N); + const auto in1 = vector_u32(input + 1 * N); + const auto in2 = vector_u32(input + 2 * N); + const auto in3 = vector_u32(input + 3 * N); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else + acc += min(one, in0 & v_ffffff80); + acc += min(one, in1 & v_ffffff80); + acc += min(one, in2 & v_ffffff80); + acc += min(one, in3 & v_ffffff80); + + acc += min(one, in0 & v_fffff800); + acc += min(one, in1 & v_fffff800); + acc += min(one, in2 & v_fffff800); + acc += min(one, in3 & v_fffff800); + + acc += min(one, in0 & v_ffff0000); + acc += min(one, in1 & v_ffff0000); + acc += min(one, in2 & v_ffff0000); + acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += 4 * N; + } + + counter += acc.sum(); + } + } + + // 2. 
vectorized loop for tail + { + const size_t max_iterations = + std::numeric_limits::max() / max_increment; + size_t blocks = length / N; + length -= blocks * N; + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + auto acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in = vector_u32(input); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else + acc += min(one, in & v_ffffff80); + acc += min(one, in & v_fffff800); + acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t *src, size_t len, char16_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_convert_utf32_to_utf16_with_errors(src, len, dst); - else - return rvv_convert_utf32_to_utf16_with_errors(src, len, dst); -} + input += N; + } -template -simdutf_really_inline static size_t rvv_convert_valid_utf32_to_utf16(const char32_t *src, size_t len, char16_t *dst) { - size_t vl8m2 = __riscv_vsetvlmax_e8m2(); - vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(__riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); - char16_t *dstBeg = dst; - for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { - vl = __riscv_vsetvl_e32m4(len); - vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t*)src, vl); - if (__riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl) < 0) { - vlOut = vl; - vuint16m2_t n = simdutf_byteflip(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); - __riscv_vse16_v_u16m2((uint16_t*)dst, n, vlOut); - continue; + counter += acc.sum(); } - vlOut = rvv_utf32_store_utf16_m4((uint16_t*)dst, v, vl, m4even); } - return dst - dstBeg; -} -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t *src, size_t len, char16_t *dst) const noexcept { - return rvv_convert_valid_utf32_to_utf16(src, len, dst); -} + const size_t consumed = input - start; + if (consumed != 0) { + // We don't count 0th bytes in the vectorized loops above, this + // is why we need to count them in the end. + counter += consumed; + } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t *src, size_t len, char16_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_convert_valid_utf32_to_utf16(src, len, dst); - else - return rvv_convert_valid_utf32_to_utf16(src, len, dst); + return counter + scalar::utf32::utf8_length_from_utf32(input, length); } -/* end file src/rvv/rvv_utf32_to.inl.cpp */ -simdutf_warn_unused int implementation::detect_encodings(const char *input, size_t length) const noexcept { +} // namespace utf32 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf32.h */ +#endif // SIMDUTF_FEATURE_UTF32 + +// +// Implementation-specific overrides +// +namespace simdutf { +namespace lasx { + +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { // If there is a BOM, then we trust it. auto bom_encoding = simdutf::BOM::check_bom(input, length); - if (bom_encoding != encoding_type::unspecified) + // todo: reimplement as a one-pass algorithm. 
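+ // The return value is a bitmask; several bits may be set at once when
+ // the same bytes validate under more than one encoding. For instance,
+ // the ASCII input "abab" validates as both UTF-8 and UTF-16LE, but not
+ // as UTF-32LE, since the 32-bit word 0x62616261 exceeds 0x10FFFF.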
+ if (bom_encoding != encoding_type::unspecified) { return bom_encoding; + } int out = 0; - if (validate_utf8(input, length)) + if (validate_utf8(input, length)) { out |= encoding_type::UTF8; - if (length % 2 == 0) { - if (validate_utf16(reinterpret_cast(input), length/2)) + } + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { out |= encoding_type::UTF16_LE; + } } - if (length % 4 == 0) { - if (validate_utf32(reinterpret_cast(input), length/4)) + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { out |= encoding_type::UTF32_LE; + } } - return out; } +#endif // SIMDUTF_FEATURE_DETECT_ENCODING -template -simdutf_really_inline static void rvv_change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) { - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t*)src, vl); - __riscv_vse16_v_u16m8((uint16_t *)dst, simdutf_byteflip(v, vl), vl); - } +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return lasx::utf8_validation::generic_validate_utf8(buf, len); } +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING -void implementation::change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_change_endianness_utf16(src, len, dst); - else - return rvv_change_endianness_utf16(src, len, dst); +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return lasx::utf8_validation::generic_validate_utf8_with_errors(buf, len); } +#endif // SIMDUTF_FEATURE_UTF8 -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); +#if SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return lasx::ascii_validation::generic_validate_ascii(buf, len); } -simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept { - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = length; // location of the first padding character if any - size_t equalsigns = 0; - if(length > 0 && input[length - 1] == '=') { - length -= 1; - equalsigns++; - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if(length > 0 && input[length - 1] == '=') { - equalsigns++; - length -= 1; - } - } - if(length == 0) { - if(equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - return {SUCCESS, 0}; - } - result r = scalar::base64::base64_tail_decode(output, input, length, options); - if(r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - } - return r; +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return lasx::ascii_validation::generic_validate_ascii_with_errors(buf, len); } - - -simdutf_warn_unused size_t 
implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return lasx::utf16::validate_utf16_as_ascii_with_errors( + buf, len) + .error == SUCCESS; } -simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept { - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = length; // location of the first padding character if any - auto equalsigns = 0; - if(length > 0 && input[length - 1] == '=') { - length -= 1; - equalsigns++; - while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if(length > 0 && input[length - 1] == '=') { - equalsigns++; - length -= 1; - } - } - if(length == 0) { - if(equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - return {SUCCESS, 0}; +simdutf_warn_unused bool +implementation::validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return lasx::utf16::validate_utf16_as_ascii_with_errors(buf, + len) + .error == SUCCESS; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. + return true; } - result r = scalar::base64::base64_tail_decode(output, input, length, options); - if(r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } + const auto res = + lasx::utf16::validate_utf16_with_errors(buf, len); + if (res.is_err()) { + return false; } - return r; -} -simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length, base64_options options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); -} + if (res.count != len) { + return scalar::utf16::validate(buf + res.count, + len - res.count); + } -size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept { - return scalar::base64::tail_encode_base64(output, input, length, options); + return true; } -} // namespace rvv -} // namespace simdutf +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING -/* begin file src/simdutf/rvv/end.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_RVV -// nothing needed. -#else -SIMDUTF_UNTARGET_REGION -#endif +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. 
+ return true; + } -/* end file src/simdutf/rvv/end.h */ -/* end file src/rvv/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_WESTMERE -/* begin file src/westmere/implementation.cpp */ -/* begin file src/simdutf/westmere/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "westmere" -// #define SIMDUTF_IMPLEMENTATION westmere + const auto res = + lasx::utf16::validate_utf16_with_errors(buf, len); + if (res.is_err()) { + return false; + } -#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE -// nothing needed. -#else -SIMDUTF_TARGET_WESTMERE -#endif -/* end file src/simdutf/westmere/begin.h */ -namespace simdutf { -namespace westmere { -namespace { -#ifndef SIMDUTF_WESTMERE_H -#error "westmere.h must be included" -#endif -using namespace simd; + if (res.count != len) { + return scalar::utf16::validate(buf + res.count, + len - res.count); + } -simdutf_really_inline bool is_ascii(const simd8x64& input) { - return input.reduce_or().is_ascii(); + return true; } -simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { - simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 - simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 - simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 - // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. - return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + const result res = + lasx::utf16::validate_utf16_with_errors(buf, len); + if (res.count != len) { + const result scalar_res = + scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } } -simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { - simd8 is_third_byte = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80 - simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80 - return simd8(is_third_byte | is_fourth_byte); +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + const result res = + lasx::utf16::validate_utf16_with_errors(buf, len); + if (res.count != len) { + const result scalar_res = + scalar::utf16::validate_with_errors(buf + res.count, + len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } } -/* begin file src/westmere/internal/loader.cpp */ -namespace internal { -namespace westmere { - -/* begin file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */ -/* -* reads a vector of uint16 values -* bits after 11th are ignored -* first 11 bits are encoded into utf8 -* !important! 
utf8_output must have at least 16 writable bytes -*/ - -inline void write_v_u16_11bits_to_utf8( - const __m128i v_u16, - char*& utf8_output, - const __m128i one_byte_bytemask, - const uint16_t one_byte_bitmask -) { - // 0b1100_0000_1000_0000 - const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080); - // 0b0001_1111_0000_0000 - const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); - // 0b0000_0000_0011_1111 - const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); - - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - - // t0 = [000a|aaaa|bbbb|bb00] - const __m128i t0 = _mm_slli_epi16(v_u16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = _mm_and_si128(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m128i t2 = _mm_and_si128(v_u16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m128i t3 = _mm_or_si128(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m128i t4 = _mm_or_si128(t3, v_c080); - - // 2. merge ASCII and 2-byte codewords - const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) - const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a - const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 - const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea - // 4. pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); - - // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - - // 6. adjust pointers - utf8_output += row[0]; +void implementation::to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return utf16::to_well_formed(input, len, output); } -inline void write_v_u16_11bits_to_utf8( - const __m128i v_u16, - char*& utf8_output, - const __m128i v_0000, - const __m128i v_ff80 -) { - // no bits set above 7th bit - const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000); - const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); - - write_v_u16_11bits_to_utf8( - v_u16, utf8_output, one_byte_bytemask, one_byte_bitmask); +void implementation::to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept { + return utf16::to_well_formed(input, len, output); } -/* end file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */ - -} // namespace westmere -} // namespace internal -/* end file src/westmere/internal/loader.cpp */ -/* begin file src/westmere/sse_detect_encodings.cpp */ -template -// len is known to be a multiple of 2 when this is called -int sse_detect_encodings(const char * buf, size_t len) { - const char* start = buf; - const char* end = buf + len; - - bool is_utf8 = true; - bool is_utf16 = true; - bool is_utf32 = true; - - int out = 0; - - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); +#endif // SIMDUTF_FEATURE_UTF16 - __m128i currentmax = _mm_setzero_si128(); +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. 
+ return true; + } + const char32_t *tail = lasx_validate_utf32le(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } +} +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING - checker check{}; +#if SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = lasx_validate_utf32le_with_errors(buf, len); + if (res.count != len) { + result scalar_res = + scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} +#endif // SIMDUTF_FEATURE_UTF32 - while(buf + 64 <= end) { - __m128i in = _mm_loadu_si128((__m128i*)buf); - __m128i secondin = _mm_loadu_si128((__m128i*)buf+1); - __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2); - __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3); - - const auto u0 = simd16(in); - const auto u1 = simd16(secondin); - const auto u2 = simd16(thirdin); - const auto u3 = simd16(fourthin); - - const auto v0 = u0.shr<8>(); - const auto v1 = u1.shr<8>(); - const auto v2 = u2.shr<8>(); - const auto v3 = u3.shr<8>(); - - const auto in16 = simd16::pack(v0, v1); - const auto nextin16 = simd16::pack(v2, v3); - - const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8; - const auto surrogates_wordmask1 = (nextin16 & v_f8) == v_d8; - uint16_t surrogates_bitmask0 = static_cast(surrogates_wordmask0.to_bitmask()); - uint16_t surrogates_bitmask1 = static_cast(surrogates_wordmask1.to_bitmask()); - - // Check for surrogates - if (surrogates_bitmask0 != 0x0 || surrogates_bitmask1 != 0x0) { - // Cannot be UTF8 - is_utf8 = false; - // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates - // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. - // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant - // bytes of a 32-bit word since they always come in pairs in UTF-16LE. - // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units. 
- - if (((surrogates_bitmask0 | surrogates_bitmask1) & 0xaaaa) != 0) { - is_utf32 = false; - // Code from sse_validate_utf16le.cpp - // Not efficient, we do not process surrogates_bitmask1 - const char16_t * input = reinterpret_cast(buf); - const char16_t* end16 = reinterpret_cast(start) + len/2; - - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - - const uint16_t V0 = static_cast(~surrogates_bitmask0); - - const auto vH0 = (in16 & v_fc) == v_dc; - const uint16_t H0 = static_cast(vH0.to_bitmask()); - - const uint16_t L0 = static_cast(~H0 & surrogates_bitmask0); - - const uint16_t a0 = static_cast(L0 & (H0 >> 1)); - - const uint16_t b0 = static_cast(a0 << 1); - - const uint16_t c0 = static_cast(V0 | a0 | b0); - - if (c0 == 0xffff) { - input += 16; - } else if (c0 == 0x7fff) { - input += 15; - } else { - is_utf16 = false; - break; - } - - while (input + simd16::SIZE * 2 < end16) { - const auto in0 = simd16(input); - const auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - - const auto in_16 = simd16::pack(t0, t1); - - const auto surrogates_wordmask = (in_16 & v_f8) == v_d8; - const uint16_t surrogates_bitmask = static_cast(surrogates_wordmask.to_bitmask()); - if (surrogates_bitmask == 0x0) { - input += 16; - } else { - const uint16_t V = static_cast(~surrogates_bitmask); - - const auto vH = (in_16 & v_fc) == v_dc; - const uint16_t H = static_cast(vH.to_bitmask()); - - const uint16_t L = static_cast(~H & surrogates_bitmask); - - const uint16_t a = static_cast(L & (H >> 1)); - - const uint16_t b = static_cast(a << 1); - - const uint16_t c = static_cast(V | a | b); - - if (c == 0xffff) { - input += 16; - } else if (c == 0x7fff) { - input += 15; - } else { - is_utf16 = false; - break; - } - } - } - } else { - is_utf16 = false; - // Check for UTF-32 - if (len % 4 == 0) { - const char32_t * input = reinterpret_cast(buf); - const char32_t* end32 = reinterpret_cast(start) + len/4; - - // Must start checking for surrogates - __m128i currentoffsetmax = _mm_setzero_si128(); - const __m128i offset = _mm_set1_epi32(0xffff2000); - const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); - - currentmax = _mm_max_epu32(in, currentmax); - currentmax = _mm_max_epu32(secondin, currentmax); - currentmax = _mm_max_epu32(thirdin, currentmax); - currentmax = _mm_max_epu32(fourthin, currentmax); - - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(secondin, offset), currentoffsetmax); - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(thirdin, offset), currentoffsetmax); - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(fourthin, offset), currentoffsetmax); - - while (input + 4 < end32) { - const __m128i in32 = _mm_loadu_si128((__m128i *)input); - currentmax = _mm_max_epu32(in32,currentmax); - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in32, offset), currentoffsetmax); - input += 4; - } - - __m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(_mm_testz_si128(forbidden_words, forbidden_words) == 0) { - is_utf32 = false; - } - } else { - is_utf32 = false; - } - } - break; - } - // If no surrogate, validate under other encodings as well +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + 
lasx_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; - // UTF-32 validation - currentmax = _mm_max_epu32(in, currentmax); - currentmax = _mm_max_epu32(secondin, currentmax); - currentmax = _mm_max_epu32(thirdin, currentmax); - currentmax = _mm_max_epu32(fourthin, currentmax); + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 - // UTF-8 validation - // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h - simd::simd8x64 in8(in, secondin, thirdin, fourthin); - check.check_next_input(in8); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lasx_convert_latin1_to_utf16le(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} - buf += 64; - } +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lasx_convert_latin1_to_utf16be(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 - // Check which encodings are possible +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + lasx_convert_latin1_to_utf32(buf, len, utf32_output); + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - if (is_utf8) { - if (static_cast(buf - start) != len) { - uint8_t block[64]{}; - std::memset(block, 0x20, 64); - std::memcpy(block, buf, len - (buf - start)); - simd::simd8x64 in(block); - check.check_next_input(in); - } - if (!check.errors()) { - out |= simdutf::encoding_type::UTF8; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + size_t pos = 0; + char *output_start{latin1_output}; + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)latin1_output & 0x1F) && pos < len) { + if (buf[pos] & 0x80) { + if (pos + 1 >= len) + return 0; + if ((buf[pos] & 0b11100000) == 0b11000000) { + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + return 0; + uint32_t code_point = + (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0xFF < 
code_point) { + return 0; } + *latin1_output++ = char(code_point); + pos += 2; + } else { + return 0; + } + } else { + *latin1_output++ = char(buf[pos]); + pos++; } + } + size_t convert_size = latin1_output - output_start; + if (pos == len) + return convert_size; + utf8_to_latin1::validating_transcoder converter; + size_t convert_result = + converter.convert(buf + pos, len - pos, latin1_output); + return convert_result ? convert_size + convert_result : 0; +} - if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { - out |= simdutf::encoding_type::UTF16_LE; - } - - if (is_utf32 && (len % 4 == 0)) { - const __m128i standardmax = _mm_set1_epi32(0x10ffff); - __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); - if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { - out |= simdutf::encoding_type::UTF32_LE; +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + size_t pos = 0; + char *output_start{latin1_output}; + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)latin1_output & 0x1F) && pos < len) { + if (buf[pos] & 0x80) { + if ((buf[pos] & 0b11100000) == 0b11000000) { + if (pos + 1 >= len) + return result(error_code::TOO_SHORT, pos); + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + return result(error_code::TOO_SHORT, pos); + uint32_t code_point = + (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111); + if (code_point < 0x80) + return result(error_code::OVERLONG, pos); + if (0xFF < code_point) + return result(error_code::TOO_LARGE, pos); + *latin1_output++ = char(code_point); + pos += 2; + } else if ((buf[pos] & 0b11110000) == 0b11100000) { + return result(error_code::TOO_LARGE, pos); + } else if ((buf[pos] & 0b11111000) == 0b11110000) { + return result(error_code::TOO_LARGE, pos); + } else { + if ((buf[pos] & 0b11000000) == 0b10000000) { + return result(error_code::TOO_LONG, pos); } + return result(error_code::HEADER_BITS, pos); + } + } else { + *latin1_output++ = char(buf[pos]); + pos++; } + } + size_t convert_size = latin1_output - output_start; + if (pos == len) + return result(error_code::SUCCESS, convert_size); - return out; + utf8_to_latin1::validating_transcoder converter; + result res = + converter.convert_with_errors(buf + pos, len - pos, latin1_output); + return res.error ? result(res.error, res.count + pos) + : result(res.error, res.count + convert_size); } -/* end file src/westmere/sse_detect_encodings.cpp */ -/* begin file src/westmere/sse_validate_utf16.cpp */ -/* - In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. 
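// (The special meaning referred to above: a leading unit c1 in
//  0xD800..0xDBFF followed by a trailing unit c2 in 0xDC00..0xDFFF encodes
//  one supplementary code point,
//
//    code_point = 0x10000 + ((c1 - 0xD800) << 10) + (c2 - 0xDC00);
//
//  and any unit in 0xD800..0xDFFF that is not part of such a pair makes
//  the stream invalid.)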
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + size_t pos = 0; + char *output_start{latin1_output}; + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)latin1_output & 0x1F) && pos < len) { + if (buf[pos] & 0x80) { + if (pos + 1 >= len) + break; + if ((buf[pos] & 0b11100000) == 0b11000000) { + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + return 0; + uint32_t code_point = + (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111); + *latin1_output++ = char(code_point); + pos += 2; + } else { + return 0; + } + } else { + *latin1_output++ = char(buf[pos]); + pos++; + } + } + size_t convert_size = latin1_output - output_start; + if (pos == len) + return convert_size; - In a vectorized algorithm we want to examine the most significant - nibble in order to select a fast path. If none of highest nibbles - are 0xD (13), than we are sure that UTF-16 chunk in a vector - register is valid. + size_t convert_result = + lasx::utf8_to_latin1::convert_valid(buf + pos, len - pos, latin1_output); + return convert_result ? convert_size + convert_result : 0; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 - Let us analyze what we need to check if the nibble is 0xD. The - value of the preceding nibble determines what we have: +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} - 0xd000 .. 0xd7ff - a valid word - 0xd800 .. 0xdbff - low surrogate - 0xdc00 .. 0xdfff - high surrogate +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} - Other constraints we have to consider: - - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) - - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) - - there must not be sole low surrogate nor high surrogate +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, + utf16_output); +} - We're going to build three bitmasks based on the 3rd nibble: - - V = valid word, - - L = low surrogate (0xd800 .. 0xdbff) - - H = high surrogate (0xdc00 .. 
0xdfff) +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} - 0 1 2 3 4 5 6 7 <--- word index - [ V | L | H | L | H | V | V | L ] - 1 0 0 0 0 1 1 0 - V = valid masks - 0 1 0 1 0 0 0 1 - L = low surrogate - 0 0 1 0 1 0 0 0 - H high surrogate +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 - 1 0 0 0 0 1 1 0 V = valid masks - 0 1 0 1 0 0 0 0 a = L & (H >> 1) - 0 0 1 0 1 0 0 0 b = a << 1 - 1 1 1 1 1 1 1 0 c = V | a | b - ^ - the last bit can be zero, we just consume 7 code units - and recheck this word in the next iteration -*/ +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} -/* Returns: - - pointer to the last unprocessed character (a scalar fallback should check the rest); - - nullptr if an error was detected. -*/ -template -const char16_t* sse_validate_utf16(const char16_t* input, size_t size) { - const char16_t* end = input + size; - - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - - while (input + simd16::SIZE * 2 < end) { - // 0. Load data: since the validation takes into account only higher - // byte of each word, we compress the two vectors into one which - // consists only the higher bytes. - auto in0 = simd16(input); - auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - if (big_endian) { - in0 = in0.swap_bytes(); - in1 = in1.swap_bytes(); - } +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 - const auto in = simd16::pack(t0, t1); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lasx_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; - // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). 
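// (The test below works on the packed high bytes: for a 16-bit unit u,
//  ((u >> 8) & 0xF8) == 0xD8 holds exactly when u is in 0xD800..0xDFFF.
//  A scalar rendering of the same check, as a sketch:
//
//    bool is_surrogate(uint16_t u) { return (u & 0xF800) == 0xD800; }
//  )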
- const auto surrogates_wordmask = (in & v_f8) == v_d8; - const uint16_t surrogates_bitmask = static_cast(surrogates_wordmask.to_bitmask()); - if (surrogates_bitmask == 0x0000) { - input += 16; - } else { - // 2. We have some surrogates that have to be distinguished: - // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) - // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) - // - // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - - // V - non-surrogate code units - // V = not surrogates_wordmask - const uint16_t V = static_cast(~surrogates_bitmask); - - // H - word-mask for high surrogates: the six highest bits are 0b1101'11 - const auto vH = (in & v_fc) == v_dc; - const uint16_t H = static_cast(vH.to_bitmask()); - - // L - word mask for low surrogates - // L = not H and surrogates_wordmask - const uint16_t L = static_cast(~H & surrogates_bitmask); - - const uint16_t a = static_cast(L & (H >> 1)); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint16_t b = static_cast(a << 1); // Just mark that the opinput - startite fact is hold, - // thanks to that we have only two masks for valid case. - const uint16_t c = static_cast(V | a | b); // Combine all the masks into the final one. - - if (c == 0xffff) { - // The whole input register contains valid UTF-16, i.e., - // either single code units or proper surrogate pairs. - input += 16; - } else if (c == 0x7fff) { - // The 15 lower code units of the input register contains valid UTF-16. - // The 15th word may be either a low or high surrogate. It the next - // iteration we 1) check if the low surrogate is followed by a high - // one, 2) reject sole high surrogate. - input += 15; - } else { - return nullptr; - } - } + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } - - return input; + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; } +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lasx_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; -template -const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) { - const char16_t* start = input; - const char16_t* end = input + size; - - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - - while (input + simd16::SIZE * 2 < end) { - // 0. Load data: since the validation takes into account only higher - // byte of each word, we compress the two vectors into one which - // consists only the higher bytes. 
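// (In scalar terms the compression step computes hi[i] = uint8_t(in[i] >> 8)
//  for i = 0..15, so all sixteen surrogate tests run on a single byte
//  vector instead of two 16-bit vectors.)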
- auto in0 = simd16(input); - auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - - if (big_endian) { - in0 = in0.swap_bytes(); - in1 = in1.swap_bytes(); - } - - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - const auto in = simd16::pack(t0, t1); +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lasx_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} - // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). - const auto surrogates_wordmask = (in & v_f8) == v_d8; - const uint16_t surrogates_bitmask = static_cast(surrogates_wordmask.to_bitmask()); - if (surrogates_bitmask == 0x0000) { - input += 16; - } else { - // 2. We have some surrogates that have to be distinguished: - // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) - // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) - // - // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - - // V - non-surrogate code units - // V = not surrogates_wordmask - const uint16_t V = static_cast(~surrogates_bitmask); - - // H - word-mask for high surrogates: the six highest bits are 0b1101'11 - const auto vH = (in & v_fc) == v_dc; - const uint16_t H = static_cast(vH.to_bitmask()); - - // L - word mask for low surrogates - // L = not H and surrogates_wordmask - const uint16_t L = static_cast(~H & surrogates_bitmask); - - const uint16_t a = static_cast(L & (H >> 1)); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint16_t b = static_cast(a << 1); // Just mark that the opinput - startite fact is hold, - // thanks to that we have only two masks for valid case. - const uint16_t c = static_cast(V | a | b); // Combine all the masks into the final one. - - if (c == 0xffff) { - // The whole input register contains valid UTF-16, i.e., - // either single code units or proper surrogate pairs. - input += 16; - } else if (c == 0x7fff) { - // The 15 lower code units of the input register contains valid UTF-16. - // The 15th word may be either a low or high surrogate. It the next - // iteration we 1) check if the low surrogate is followed by a high - // one, 2) reject sole high surrogate. 
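// (Recap of the mask algebra from the header comment: a = L & (H >> 1)
//  marks each low surrogate that is immediately followed by a high one,
//  b = a << 1 marks those high units, and c = V | a | b is all-ones exactly
//  when every lane is a non-surrogate or half of a proper pair. c == 0x7fff
//  leaves only lane 15 undecided; the 15-unit advance below re-examines
//  that unit on the next iteration.)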
- input += 15; - } else { - return result(error_code::SURROGATE, input - start); - } - } +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lasx_convert_utf16_to_latin1_with_errors(buf, len, + latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} - return result(error_code::SUCCESS, input - start); +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function. + return convert_utf16be_to_latin1(buf, len, latin1_output); } -/* end file src/westmere/sse_validate_utf16.cpp */ -/* begin file src/westmere/sse_validate_utf32le.cpp */ -/* Returns: - - pointer to the last unprocessed character (a scalar fallback should check the rest); - - nullptr if an error was detected. -*/ -const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) { - const char32_t* end = input + size; - - const __m128i standardmax = _mm_set1_epi32(0x10ffff); - const __m128i offset = _mm_set1_epi32(0xffff2000); - const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); - __m128i currentmax = _mm_setzero_si128(); - __m128i currentoffsetmax = _mm_setzero_si128(); - - while (input + 4 < end) { - const __m128i in = _mm_loadu_si128((__m128i *)input); - currentmax = _mm_max_epu32(in,currentmax); - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); - input += 4; - } - __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); - if(_mm_test_all_zeros(is_zero, is_zero) == 0) { - return nullptr; - } - is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(_mm_test_all_zeros(is_zero, is_zero) == 0) { - return nullptr; - } +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function. 
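// (Under the convert_valid contract the input is already known to be valid
//  UTF-16 representable in Latin1, i.e. every code unit is <= 0x00FF, so on
//  a little-endian host a dedicated routine could reduce to plain narrowing,
//  sketched as a hypothetical loop:
//
//    for (size_t i = 0; i < len; i++) { latin1_output[i] = char(buf[i]); }
//
//  The delegation below simply reuses the checked converter instead.)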
+ return convert_utf16le_to_latin1(buf, len, latin1_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 - return input; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + lasx_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; } +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + lasx_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} -const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) { - const char32_t* start = input; - const char32_t* end = input + size; +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lasx_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} - const __m128i standardmax = _mm_set1_epi32(0x10ffff); - const __m128i offset = _mm_set1_epi32(0xffff2000); - const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); - __m128i currentmax = _mm_setzero_si128(); - __m128i currentoffsetmax = _mm_setzero_si128(); +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lasx_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + 
ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} - while (input + 4 < end) { - const __m128i in = _mm_loadu_si128((__m128i *)input); - currentmax = _mm_max_epu32(in,currentmax); - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} - __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); - if(_mm_test_all_zeros(is_zero, is_zero) == 0) { - return result(error_code::TOO_LARGE, input - start); - } +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 - is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(_mm_test_all_zeros(is_zero, is_zero) == 0) { - return result(error_code::SURROGATE, input - start); - } - input += 4; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return 0; + } + std::pair ret = + lasx_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - return result(error_code::SUCCESS, input - start); +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lasx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; } -/* end file src/westmere/sse_validate_utf32le.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -/* begin file src/westmere/sse_convert_latin1_to_utf8.cpp */ -std::pair sse_convert_latin1_to_utf8( - const char* latin_input, - const size_t latin_input_length, - char* utf8_output) { - const char* end = latin_input + latin_input_length; +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + lasx_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { 
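// (The recurring pattern of this file: the vector kernel reports how far it
//  safely advanced (ret.first) and where it wrote to (ret.second), and any
//  remaining tail, typically shorter than one register, is finished from
//  exactly that point by the scalar converter. In the *_with_errors
//  variants, ret.first.count likewise tracks the input position until the
//  whole buffer is known good, and only then is rewritten to the number of
//  output code units.)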
+ const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - const __m128i v_0000 = _mm_setzero_si128(); - // 0b1000_0000 - const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80); - // 0b1111_1111_1000_0000 - const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + lasx_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - const __m128i latin_1_half_into_u16_byte_mask = _mm_setr_epi8( - 0, '\x80', - 1, '\x80', - 2, '\x80', - 3, '\x80', - 4, '\x80', - 5, '\x80', - 6, '\x80', - 7, '\x80' - ); - - const __m128i latin_2_half_into_u16_byte_mask = _mm_setr_epi8( - 8, '\x80', - 9, '\x80', - 10, '\x80', - 11, '\x80', - 12, '\x80', - 13, '\x80', - 14, '\x80', - 15, '\x80' - ); +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lasx_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written + return ret.first; +} - // each latin1 takes 1-2 utf8 bytes - // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then adjust the pointer) - // so the last write can exceed the utf8_output size by 8-1 bytes - // by reserving 8 extra input bytes, we expect the output to have 8-16 bytes free - while (latin_input + 16 + 8 <= end) { - // Load 16 Latin1 characters (16 bytes) into a 128-bit register - __m128i v_latin = _mm_loadu_si128((__m128i*)latin_input); +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lasx_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += 
ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written + return ret.first; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lasx_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; - if (_mm_testz_si128(v_latin, v_80)) {// ASCII fast path!!!! - _mm_storeu_si128((__m128i*)utf8_output, v_latin); - latin_input += 16; - utf8_output += 16; - continue; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lasx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} - // assuming a/b are bytes and A/B are uint16 of the same value - // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA - __m128i v_u16_latin_1_half = _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask); - // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB - __m128i v_u16_latin_2_half = _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask); - +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lasx_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; - internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80); - internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half, utf8_output, v_0000, v_ff80); - latin_input += 16; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid( + ret.first, len - (ret.first - buf), ret.second); + saved_bytes += scalar_saved_bytes; } + return saved_bytes; +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - if (latin_input + 16 <= end) { - // Load 16 Latin1 characters (16 bytes) into a 128-bit register - __m128i v_latin = _mm_loadu_si128((__m128i*)latin_input); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // optimization opportunity: implement a custom function. 
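// (For reference, the scalar encoding that a custom routine would
//  vectorize; one valid code point c becomes 1..4 UTF-8 bytes, with p a
//  hypothetical char* output cursor:
//
//    if      (c < 0x80)    { *p++ = char(c); }
//    else if (c < 0x800)   { *p++ = char(0xC0 | (c >> 6));
//                            *p++ = char(0x80 | (c & 0x3F)); }
//    else if (c < 0x10000) { *p++ = char(0xE0 | (c >> 12));
//                            *p++ = char(0x80 | ((c >> 6) & 0x3F));
//                            *p++ = char(0x80 | (c & 0x3F)); }
//    else                  { *p++ = char(0xF0 | (c >> 18));
//                            *p++ = char(0x80 | ((c >> 12) & 0x3F));
//                            *p++ = char(0x80 | ((c >> 6) & 0x3F));
//                            *p++ = char(0x80 | (c & 0x3F)); }
//  )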
+ return convert_utf32_to_utf8(buf, len, utf8_output); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 - if (_mm_testz_si128(v_latin, v_80)) {// ASCII fast path!!!! - _mm_storeu_si128((__m128i*)utf8_output, v_latin); - latin_input += 16; - utf8_output += 16; - } else { - // assuming a/b are bytes and A/B are uint16 of the same value - // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA - __m128i v_u16_latin_1_half = _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask); - internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80); - latin_input += 8; +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lasx_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; } - return std::make_pair(latin_input, utf8_output); + return saved_bytes; } -/* end file src/westmere/sse_convert_latin1_to_utf8.cpp */ -/* begin file src/westmere/sse_convert_latin1_to_utf16.cpp */ -template -std::pair sse_convert_latin1_to_utf16(const char *latin1_input, size_t len, - char16_t *utf16_output) { - size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - for (size_t i = 0; i < rounded_len; i += 16) { - // Load 16 Latin1 characters into a 128-bit register - __m128i in = _mm_loadu_si128(reinterpret_cast(&latin1_input[i])); - __m128i out1 = big_endian ? _mm_unpacklo_epi8(_mm_setzero_si128(), in) - : _mm_unpacklo_epi8(in, _mm_setzero_si128()); - __m128i out2 = big_endian ? 
_mm_unpackhi_epi8(_mm_setzero_si128(), in) - : _mm_unpackhi_epi8(in, _mm_setzero_si128()); - // Zero extend each Latin1 character to 16-bit integers and store the results back to memory - _mm_storeu_si128(reinterpret_cast<__m128i*>(&utf16_output[i]), out1); - _mm_storeu_si128(reinterpret_cast<__m128i*>(&utf16_output[i + 8]), out2); - } - // return pointers pointing to where we left off - return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lasx_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; } -/* end file src/westmere/sse_convert_latin1_to_utf16.cpp */ -/* begin file src/westmere/sse_convert_latin1_to_utf32.cpp */ -std::pair sse_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) { - const char* end = buf + len; - - while (buf + 16 <= end) { - // Load 16 Latin1 characters (16 bytes) into a 128-bit register - __m128i in = _mm_loadu_si128((__m128i*)buf); - - // Shift input to process next 4 bytes - __m128i in_shifted1 = _mm_srli_si128(in, 4); - __m128i in_shifted2 = _mm_srli_si128(in, 8); - __m128i in_shifted3 = _mm_srli_si128(in, 12); - - // expand 8-bit to 32-bit unit - __m128i out1 = _mm_cvtepu8_epi32(in); - __m128i out2 = _mm_cvtepu8_epi32(in_shifted1); - __m128i out3 = _mm_cvtepu8_epi32(in_shifted2); - __m128i out4 = _mm_cvtepu8_epi32(in_shifted3); - - _mm_storeu_si128((__m128i*)utf32_output, out1); - _mm_storeu_si128((__m128i*)(utf32_output + 4), out2); - _mm_storeu_si128((__m128i*)(utf32_output + 8), out3); - _mm_storeu_si128((__m128i*)(utf32_output + 12), out4); - - utf32_output += 16; - buf += 16; + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lasx_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} - return std::make_pair(buf, utf32_output); +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lasx_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return 
scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; } -/* end file src/westmere/sse_convert_latin1_to_utf32.cpp */ +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} -/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */ -// depends on "tables/utf8_to_utf16_tables.h" +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). -template -size_t convert_masked_utf8_to_utf16(const char *input, - uint64_t utf8_end_of_code_point_mask, - char16_t *&utf16_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - // - // Optimization note: our main path below is load-latency dependent. Thus it is maybe - // beneficial to have fast paths that depend on branch prediction but have less latency. - // This results in more instructions but, potentially, also higher speeds. - // - // We first try a few fast paths. - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - const __m128i in = _mm_loadu_si128((__m128i *)input); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. - __m128i ascii_first = _mm_cvtepu8_epi16(in); - __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8)); - if (big_endian) { - ascii_first = _mm_shuffle_epi8(ascii_first, swap); - ascii_second = _mm_shuffle_epi8(ascii_second, swap); - } - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second); - utf16_output += 16; // We wrote 16 16-bit characters. - return 16; // We consumed 16 bytes. - } - if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) { - // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte UTF-16 code units. - // There is probably a more efficient sequence, but the following might do. 
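// (Per input pair [110a|aaaa][10bb|bbbb] the target unit is
//  [0000|0aaa|aabb|bbbb]. After the byte swap each 16-bit lane holds
//  lead << 8 | trail, so the masks below compute, lane-wise:
//
//    uint16_t out = (lane & 0x007F)           // bbbbbb, top bit masked off
//                 | ((lane & 0x1F00) >> 2);   // aaaaa shifted into place
//  )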
- const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - if (big_endian) composed = _mm_shuffle_epi8(composed, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed); - utf16_output += 8; // We wrote 16 bytes, 8 code points. - return 16; - } - if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte UTF-16 code units. - // There is probably a more efficient sequence, but the following might do. - const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - __m128i composed_repacked = _mm_packus_epi32(composed, composed); - if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); - utf16_output += 4; - return 12; - } - /// We do not have a fast path available, so we fallback. +#if SIMDUTF_FEATURE_UTF16 +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} - const uint8_t idx = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; - if (idx < 64) { - // SIX (6) input code-code units - // this is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six code - // code units spanning between 1 and 2 bytes each is 12 bytes. On processors - // where pdep/pext is fast, we might be able to use a small lookup table. - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - if (big_endian) composed = _mm_shuffle_epi8(composed, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed); - utf16_output += 6; // We wrote 12 bytes, 6 code points. 
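// (The idx ranges partition the shuffle table by worst-case sequence width:
//  idx < 64 covers up to six 1-2 byte sequences, idx < 145 up to four
//  1-3 byte sequences, and idx < 209 up to two 1-4 byte sequences;
//  'consumed' records how many of the 12 staged input bytes the chosen
//  shuffle actually eats.)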
- } else if (idx < 145) { - // FOUR (4) input code-code units - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - __m128i composed_repacked = _mm_packus_epi32(composed, composed); - if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); - utf16_output += 4; - } else if (idx < 209) { - // TWO (2) input code-code units - ////////////// - // There might be garbage inputs where a leading byte mascarades as a four-byte - // leading byte (by being followed by 3 continuation byte), but is not greater than - // 0xf0. This could trigger a buffer overflow if we only counted leading - // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation. - // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs. - // We do as at the cost of an extra mask. - ///////////// - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); - const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); - // correct for spurious high bit - const __m128i correct = - _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); - middlehighbyte = _mm_xor_si128(correct, middlehighbyte); - const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); - // We deliberately carry the leading four bits in highbyte if they are present, - // we remove them later when computing hightenbits. - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000)); - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); - // When we need to generate a surrogate pair (leading byte > 0xF0), then - // the corresponding 32-bit value in 'composed' will be greater than - // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the - // location of the surrogate pairs. 
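// (Scalar shape of the pair construction carried out vectorially below,
//  for one code point c >= 0x10000:
//
//    uint32_t v     = c - 0x10000;                     // 20 payload bits
//    uint16_t lead  = uint16_t(0xD800 + (v >> 10));    // high ten bits
//    uint16_t trail = uint16_t(0xDC00 + (v & 0x3FF));  // low ten bits
//  )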
- const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), - _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); - const __m128i composedminus = - _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); - const __m128i lowtenbits = - _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); - // Notice the 0x3ff mask: - const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff)); - const __m128i lowtenbitsadd = - _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); - const __m128i hightenbitsadd = - _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); - const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); - __m128i surrogates = - _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); - uint32_t basic_buffer[4]; - uint32_t basic_buffer_swap[4]; - if (big_endian) { - _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap)); - surrogates = _mm_shuffle_epi8(surrogates, swap); - } - _mm_storeu_si128((__m128i *)basic_buffer, composed); - uint32_t surrogate_buffer[4]; - _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); - for (size_t i = 0; i < 3; i++) { - if(basic_buffer[i] > 0x3c00000) { - utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); - utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); - utf16_output += 2; - } else { - utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]); - utf16_output++; - } - } - } else { - // here we know that there is an error but we do not handle errors - } - return consumed; +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); } -/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */ -/* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */ -// depends on "tables/utf8_to_utf16_tables.h" +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 -// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). -size_t convert_masked_utf8_to_utf32(const char *input, - uint64_t utf8_end_of_code_point_mask, - char32_t *&utf32_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - // - // Optimization note: our main path below is load-latency dependent. Thus it is maybe - // beneficial to have fast paths that depend on branch prediction but have less latency. - // This results in more instructions but, potentially, also higher speeds. - // - // We first try a few fast paths. - const __m128i in = _mm_loadu_si128((__m128i *)input); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. 
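// (All 16 staged bytes are ASCII here, so transcoding is pure
//  zero-extension: each byte b becomes uint32_t(b). The four stores below
//  each widen four input bytes into 16 bytes of output.)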
- _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4))); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8))); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12))); - utf32_output += 16; // We wrote 16 32-bit characters. - return 16; // We consumed 16 bytes. - } - if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { - // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte UTF-32 code units. - // There is probably a more efficient sequence, but the following might do. - const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8))); - utf32_output += 8; // We wrote 32 bytes, 8 code points. - return 16; - } - if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units. - // There is probably a more efficient sequence, but the following might do. - const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - _mm_storeu_si128((__m128i *)utf32_output, composed); - utf32_output += 4; - return 12; +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + size_t pos = 0; + size_t count = 0; + // Performance degradation when memory address is not 32-byte aligned + while ((((uint64_t)input + pos) & 0x1F && pos < length)) { + if (input[pos++] > -65) { + count++; + } } - /// We do not have a fast path available, so we fallback. + __m256i v_bf = __lasx_xvldi(0xBF); // 0b10111111 + for (; pos + 32 <= length; pos += 32) { + __m256i in = __lasx_xvld(reinterpret_cast(input + pos), 0); + __m256i utf8_count = + __lasx_xvpcnt_h(__lasx_xvmskltz_b(__lasx_xvslt_b(v_bf, in))); + count = count + __lasx_xvpickve2gr_wu(utf8_count, 0) + + __lasx_xvpickve2gr_wu(utf8_count, 4); + } + return count + scalar::utf8::count_code_points(input + pos, length - pos); +} +#endif // SIMDUTF_FEATURE_UTF8 - const uint8_t idx = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; - if (idx < 64) { - // SIX (6) input code-code units - // this is a relatively easy scenario - // we process SIX (6) input code-code units. 
The max length in bytes of six code - // code units spanning between 1 and 2 bytes each is 12 bytes. On processors - // where pdep/pext is fast, we might be able to use a small lookup table. - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8))); - utf32_output += 6; // We wrote 12 bytes, 6 code points. - } else if (idx < 145) { - // FOUR (4) input code-code units - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - _mm_storeu_si128((__m128i *)utf32_output, composed); - utf32_output += 4; - } else if (idx < 209) { - // TWO (2) input code-code units - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); - const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); - // correct for spurious high bit - const __m128i correct = - _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); - middlehighbyte = _mm_xor_si128(correct, middlehighbyte); - const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), - _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); - _mm_storeu_si128((__m128i *)utf32_output, composed); - utf32_output += 3; - } else { - // here we know that there is an error but we do not handle errors +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + const uint8_t *data = reinterpret_cast(input); + const uint8_t *data_end = data + length; + uint64_t result = 0; + while (data_end - data > 16) { + uint64_t two_bytes = 0; + __m128i input_vec = __lsx_vld(data, 0); + two_bytes = + __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0); + result += 16 + two_bytes; + data 
+= 16;
+  }
-  return consumed;
+  return result + scalar::latin1::utf8_length_from_latin1((const char *)data,
+                                                          data_end - data);
}
-/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */
-/* begin file src/westmere/sse_convert_utf8_to_latin1.cpp */
-// depends on "tables/utf8_to_utf16_tables.h"
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16_bytemask<endianness::LITTLE>(input,
+                                                                    length);
+}

-// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
-// end of the code points. Only the least significant 12 bits of the mask
-// are accessed.
-// It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_latin1(const char *input,
-                                     uint64_t utf8_end_of_code_point_mask,
-                                     char *&latin1_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it may be
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff; // we're only processing 12 bytes in case it's not all ASCII
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
-    latin1_output += 16; // We wrote 16 characters.
-    return 16;           // We consumed 16 bytes.
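// Illustrative scalar sketch (C++, assumes <cstdint>; not part of the vendored
// simdutf sources): the "end of code point" mask consumed by these masked
// converters has bit i set when byte i is the last byte of its code point,
// i.e. when the following byte is not a UTF-8 continuation byte. Helper name
// and shape are ours.
static inline uint16_t end_of_code_point_mask_sketch(const unsigned char *p) {
  uint16_t mask = 0;
  for (int i = 0; i < 16; i++) {
    // byte i ends a code point unless the next byte is 10xxxxxx
    const bool next_is_continuation = (i + 1 < 16) && ((p[i + 1] & 0xC0) == 0x80);
    if (!next_is_continuation) { mask |= uint16_t(1u << i); }
  }
  return mask; // all-ASCII input yields 0xffff, matching the fast-path test above
}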
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16_bytemask(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8_bytemask(input, length); +} +simdutf_warn_unused result +implementation::utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::LITTLE>(input, length); +} + +simdutf_warn_unused result +implementation::utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::BIG>(input, length); +} + +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return utf32::utf8_length_from_utf32(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + __m128i v_ffff = lsx_splat_u32(0x0000ffff); + size_t pos = 0; + size_t count = 0; + for (; pos + 4 <= length; pos += 4) { + __m128i in = __lsx_vld(reinterpret_cast(input + pos), 0); + __m128i surrogate_bytemask = __lsx_vslt_wu(v_ffff, in); + size_t surrogate_count = __lsx_vpickve2gr_bu( + __lsx_vpcnt_b(__lsx_vmskltz_w(surrogate_bytemask)), 0); + count += 4 + surrogate_count; } - /// We do not have a fast path available, so we fallback. - const uint8_t idx = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; - // this indicates an invalid input: - if(idx >= 64) { return consumed; } - // Here we should have (idx < 64), if not, there is a bug in the validation or elsewhere. - // SIX (6) input code-code units - // this is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six code - // code units spanning between 1 and 2 bytes each is 12 bytes. On processors - // where pdep/pext is fast, we might be able to use a small lookup table. 
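// Illustrative scalar sketch (not part of the vendored simdutf sources): the
// utf16_length_from_utf32 loop above adds "4 + surrogate_count" per vector
// because every code point needs one 16-bit unit, plus one more when it lies
// beyond the BMP. The same identity in scalar form:
static inline size_t utf16_length_from_utf32_sketch(const char32_t *p, size_t n) {
  size_t units = n;                           // one unit per code point, minimum
  for (size_t i = 0; i < n; i++) {
    if (uint32_t(p[i]) > 0xFFFF) { units++; } // supplementary plane: surrogate pair
  }
  return units; // assumes valid UTF-32 input
}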
- const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - const __m128i latin1_packed = _mm_packus_epi16(composed,composed); - // writing 8 bytes even though we only care about the first 6 bytes. - // performance note: it would be faster to use _mm_storeu_si128, we should investigate. - _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); - latin1_output += 6; // We wrote 6 bytes. - return consumed; + return count + + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); } -/* end file src/westmere/sse_convert_utf8_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -/* begin file src/westmere/sse_convert_utf16_to_latin1.cpp */ -template -std::pair sse_convert_utf16_to_latin1(const char16_t* buf, size_t len, char* latin1_output) { - const char16_t* end = buf + len; - while (buf + 8 <= end) { - // Load 8 UTF-16 characters into 128-bit SSE register - __m128i in = _mm_loadu_si128(reinterpret_cast(buf)); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 - if (!match_system(big_endian)) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); +#if SIMDUTF_FEATURE_BASE64 +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } +} - __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00); - if (_mm_testz_si128(in, high_byte_mask)) { - // Pack 16-bit characters into 8-bit and store in latin1_output - __m128i latin1_packed = _mm_packus_epi16(in, in); - _mm_storel_epi64(reinterpret_cast<__m128i*>(latin1_output), latin1_packed); - // Adjust pointers for next iteration - buf += 8; - latin1_output += 8; +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, 
length, options, last_chunk_options); } else { - return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + return compress_decode_base64( + output, input, length, options, last_chunk_options); } - } // while - return std::make_pair(buf, latin1_output); + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } } -template -std::pair sse_convert_utf16_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) { - const char16_t* start = buf; - const char16_t* end = buf + len; - while (buf + 8 <= end) { - __m128i in = _mm_loadu_si128(reinterpret_cast(buf)); - - if (!big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); } + } +} - __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00); - if (_mm_testz_si128(in, high_byte_mask)) { - __m128i latin1_packed = _mm_packus_epi16(in, in); - _mm_storel_epi64(reinterpret_cast<__m128i*>(latin1_output), latin1_packed); - buf += 8; - latin1_output += 8; +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); } else { - // Fallback to scalar code for handling errors - for(int k = 0; k < 8; k++) { - uint16_t word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if(word <= 0xff) { - *latin1_output++ = char(word); - } else { - return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output); - } - } - buf += 8; + return compress_decode_base64( + output, input, length, options, last_chunk_options); } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } } -/* end file src/westmere/sse_convert_utf16_to_latin1.cpp */ -/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */ -/* - The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit code units. - We consider three cases: - 1. an input register contains no surrogates and each value - is in range 0x0000 .. 0x07ff. - 2. an input register contains no surrogates and values are - is in range 0x0000 .. 0xffff. - 3. an input register contains surrogates --- i.e. codepoints - can have 16 or 32 bits. +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + if (options & base64_url) { + return encode_base64(output, input, length, options); + } else { + return encode_base64(output, input, length, options); + } +} - Ad 1. +size_t implementation::binary_to_base64_with_lines( + const char *input, size_t length, char *output, size_t line_length, + base64_options options) const noexcept { + return scalar::base64::tail_encode_base64_impl(output, input, length, + options, line_length); +} - When values are less than 0x0800, it means that a 16-bit code unit - can be converted into: 1) single UTF8 byte (when it's an ASCII - char) or 2) two UTF8 bytes. +const char *implementation::find(const char *start, const char *end, + char character) const noexcept { + return util_find(start, end, character); +} - For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single - shuffle. +const char16_t *implementation::find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept { + return util_find(start, end, character); +} +#endif // SIMDUTF_FEATURE_BASE64 + +} // namespace lasx +} // namespace simdutf + +/* begin file src/simdutf/lasx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP + +#if SIMDUTF_CAN_ALWAYS_RUN_LASX +// nothing needed. 
+#else +SIMDUTF_UNTARGET_REGION +#endif +/* end file src/simdutf/lasx/end.h */ +/* end file src/lasx/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_LSX +/* begin file src/lsx/implementation.cpp */ +/* begin file src/simdutf/lsx/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "lsx" +// #define SIMDUTF_IMPLEMENTATION lsx +#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 +/* end file src/simdutf/lsx/begin.h */ +namespace simdutf { +namespace lsx { +namespace { +#ifndef SIMDUTF_LSX_H + #error "lsx.h must be included" +#endif +using namespace simd; + +#if SIMDUTF_FEATURE_UTF8 +// convert vmskltz/vmskgez/vmsknz to +// simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes index +const uint8_t lsx_1_2_utf8_bytes_mask[] = { + 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, + 85, 2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83, + 86, 87, 8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88, + 89, 92, 93, 10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79, + 90, 91, 94, 95, 32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100, + 101, 112, 113, 116, 117, 34, 35, 38, 39, 50, 51, 54, 55, 98, 99, + 102, 103, 114, 115, 118, 119, 40, 41, 44, 45, 56, 57, 60, 61, 104, + 105, 108, 109, 120, 121, 124, 125, 42, 43, 46, 47, 58, 59, 62, 63, + 106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148, + 149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147, + 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152, + 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143, + 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164, + 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163, + 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168, + 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253, + 170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254, + 255}; +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 +simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) { + return __lsx_vshuf4i_b(vec, 0b10110001); +} +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || \ + SIMDUTF_FEATURE_UTF8 +simdutf_really_inline bool is_ascii(const simd8x64 &input) { + return input.is_ascii(); +} +#endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || + // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_really_inline simd8 +must_be_2_3_continuation(const simd8 prev2, + const simd8 prev3) { + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + return is_third_byte ^ is_fourth_byte; +} +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32) +// common functions for utf8 conversions +simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) { + // Low half contains 10bbbbbb|10cccccc + // High half contains 1110aaaa|1110aaaa + const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9}; + const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff}; + + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh); + // 1110aaaa => aaaa0000 + __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4); + // 10bbbbbb 10cccccc => 0010bbbb bbcccccc + __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 
2*/ + perm, __lsx_vrepli_h(0x3f) /* 0x003f */); + // 0010bbbb bbcccccc => aaaabbbb bbcccccc + composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff); + + return composed; +} + +simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) { + // 10bbbbb 110aaaaa => 00bbbbb 000aaaaa + __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f)); + // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb + composed = __lsx_vbitsel_v( + __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */ + __lsx_vsrli_h(composed, 8), /* bbbbbb >> 8 */ + __lsx_vrepli_h(0x3f)); /* 0x003f */ + return composed; +} + +simdutf_really_inline __m128i +convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) { + // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. + // This is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. + __m128i sh = + __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]), + 0); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh); + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000000 00bbbbbb + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits + // 1 byte: 00000000 00000000 + // 2 byte: 00000aaa aa000000 + const __m128i v1f00 = lsx_splat_u16(0x1f00); + __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits + // Combine with a shift right accumulate + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000aaa aabbbbbb + composed = __lsx_vadd_h(ascii, composed); + return composed; +} +#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || + // SIMDUTF_FEATURE_UTF32) + +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/lsx/lsx_validate_utf16.cpp */ +template +simd8 utf16_gather_high_bytes(const simd16 in0, + const simd16 in1) { + if (big_endian) { + const auto mask = simd16(0x00ff); + const auto t0 = in0 & mask; + const auto t1 = in1 & mask; + + return simd16::pack(t0, t1); + } else { + return simd16::pack_shifted_right<8>(in0, in1); + } +} +/* end file src/lsx/lsx_validate_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/lsx/lsx_validate_utf32le.cpp */ +const char32_t *lsx_validate_utf32le(const char32_t *input, size_t size) { + const char32_t *end = input + size; + + __m128i offset = lsx_splat_u32(0xffff2000); + __m128i standardoffsetmax = lsx_splat_u32(0xfffff7ff); + __m128i standardmax = lsx_splat_u32(0x10ffff); + __m128i currentmax = lsx_splat_u32(0); + __m128i currentoffsetmax = lsx_splat_u32(0); + + while (input + 4 < end) { + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + currentmax = __lsx_vmax_wu(in, currentmax); + // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF + currentoffsetmax = + __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax); + + input += 4; + } + + __m128i is_zero = + __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax); + if (__lsx_bnz_v(is_zero)) { + return nullptr; + } - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (__lsx_bnz_v(is_zero)) { + return nullptr; + } - Ad 2. 
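// Illustrative scalar sketch (C++, assumes <cstdint>; not part of the vendored
// simdutf sources): the 0xffff2000 offset in lsx_validate_utf32le above works
// because, modulo 2^32, only the surrogate range 0xD800..0xDFFF lands in
// 0xFFFFF800..0xFFFFFFFF, i.e. above the 0xFFFFF7FF threshold; the 0x10FFFF
// maximum is checked separately.
static inline bool valid_utf32_code_point_sketch(uint32_t v) {
  const bool too_large = v > 0x10FFFF;
  const bool surrogate = (v + 0xFFFF2000u) > 0xFFFFF7FFu; // wraps for v >= 0xE000
  return !too_large && !surrogate;
}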
+ return input; +} - When values fit in 16-bit code units, but are above 0x07ff, then - a single word may produce one, two or three UTF8 bytes. +const result lsx_validate_utf32le_with_errors(const char32_t *input, + size_t size) { + const char32_t *start = input; + const char32_t *end = input + size; - We prepare data for all these three cases in two registers. - The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. + __m128i offset = lsx_splat_u32(0xffff2000); + __m128i standardoffsetmax = lsx_splat_u32(0xfffff7ff); + __m128i standardmax = lsx_splat_u32(0x10ffff); + __m128i currentmax = lsx_splat_u32(0); + __m128i currentoffsetmax = lsx_splat_u32(0); - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. + while (input + 4 < end) { + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + currentmax = __lsx_vmax_wu(in, currentmax); + currentoffsetmax = + __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax); - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + __m128i is_zero = + __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax); + if (__lsx_bnz_v(is_zero)) { + return result(error_code::TOO_LARGE, input - start); + } + is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (__lsx_bnz_v(is_zero)) { + return result(error_code::SURROGATE, input - start); + } - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ + input += 4; + } + + return result(error_code::SUCCESS, input - start); +} +/* end file src/lsx/lsx_validate_utf32le.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/lsx/lsx_convert_latin1_to_utf8.cpp */ /* Returns a pair: the first unprocessed byte from buf and utf8_output A scalar routing should carry on the conversion of the tail. */ -template -std::pair sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) { - const char16_t* end = buf + len; - - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 +std::pair +lsx_convert_latin1_to_utf8(const char *latin1_input, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char *end = latin1_input + len; - while (buf + 16 + safety_margin <= end) { - __m128i in = _mm_loadu_si128((__m128i*)buf); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); - } - // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes - const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); - if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! - __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - nextin = _mm_shuffle_epi8(nextin, swap); - } - if(!_mm_testz_si128(nextin, v_ff80)) { - // 1. pack the bytes - // obviously suboptimal. 
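// Illustrative scalar sketch (not part of the vendored simdutf sources): the
// bit layout assembled in lsx_convert_latin1_to_utf8 above. A Latin-1 byte
// either passes through unchanged (ASCII) or expands to the two-byte form
// 110000aa 10bbbbbb. The helper name is ours.
static inline size_t latin1_byte_to_utf8_sketch(unsigned char b, char out[2]) {
  if (b < 0x80) { out[0] = char(b); return 1; } // ASCII fast path
  out[0] = char(0xC0 | (b >> 6));   // 1100|00aa
  out[1] = char(0x80 | (b & 0x3F)); // 10bb|bbbb
  return 2;
}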
- const __m128i utf8_packed = _mm_packus_epi16(in,in); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - in = nextin; - } else { - // 1. pack the bytes - // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in,nextin); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } + __m128i zero = __lsx_vldi(0); + // We always write 16 bytes, of which more than the first 8 bytes + // are valid. A safety margin of 8 is more than sufficient. + while (end - latin1_input >= 16) { + __m128i in8 = __lsx_vld(reinterpret_cast(latin1_input), 0); + uint32_t ascii = __lsx_vpickve2gr_hu(__lsx_vmskgez_b(in8), 0); + if (ascii == 0xffff) { // ASCII fast path!!!! + __lsx_vst(in8, utf8_output, 0); + utf8_output += 16; + latin1_input += 16; + continue; } + // We just fallback on UTF-16 code. This could be optimized/simplified + // further. + __m128i in16 = __lsx_vilvl_b(zero, in8); + // 1. prepare 2-byte values + // input 8-bit word : [aabb|bbbb] x 8 + // expected output : [1100|00aa|10bb|bbbb] x 8 + // t0 = [0000|00aa|bbbb|bb00] + __m128i t0 = __lsx_vslli_h(in16, 2); + // t1 = [0000|00aa|0000|0000] + __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x300)); + // t3 = [0000|00aa|00bb|bbbb] + __m128i t2 = __lsx_vbitsel_v(t1, in16, __lsx_vrepli_h(0x3f)); + // t4 = [1100|00aa|10bb|bbbb] + __m128i t3 = __lsx_vor_v(t2, __lsx_vreplgr2vr_h(uint16_t(0xc080))); + // merge ASCII and 2-byte codewords + __m128i one_byte_bytemask = __lsx_vsle_hu(in16, __lsx_vrepli_h(0x7F)); + __m128i utf8_unpacked = __lsx_vbitsel_v(t3, in16, one_byte_bytemask); + + const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lsx_1_2_utf8_bytes_mask[(ascii & 0xff)]][0]; + __m128i shuffle = __lsx_vld(row + 1, 0); + __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); + + // store bytes + __lsx_vst(utf8_packed, utf8_output, 0); + // adjust pointers + latin1_input += 8; + utf8_output += row[0]; - // no bits set above 7th bit - const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); - const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); + } // while - // no bits set above 11th bit - const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); - const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + return std::make_pair(latin1_input, reinterpret_cast(utf8_output)); +} +/* end file src/lsx/lsx_convert_latin1_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/lsx/lsx_convert_latin1_to_utf16.cpp */ +std::pair +lsx_convert_latin1_to_utf16le(const char *buf, size_t len, + char16_t *utf16_output) { + const char *end = buf + len; + + __m128i zero = __lsx_vldi(0); + while (end - buf >= 16) { + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); + + __m128i inlow = __lsx_vilvl_b(zero, in8); + __m128i inhigh = __lsx_vilvh_b(zero, in8); + __lsx_vst(inlow, reinterpret_cast(utf16_output), 0); + __lsx_vst(inhigh, reinterpret_cast(utf16_output), 16); + + utf16_output += 16; + buf += 16; + } - if (one_or_two_bytes_bitmask == 0xffff) { - internal::westmere::write_v_u16_11bits_to_utf8(in, utf8_output, one_byte_bytemask, one_byte_bitmask); - buf += 8; - continue; - } + return 
std::make_pair(buf, utf16_output); +} - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. - const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); +std::pair +lsx_convert_latin1_to_utf16be(const char *buf, size_t len, + char16_t *utf16_output) { + const char *end = buf + len; + __m128i zero = __lsx_vldi(0); + while (end - buf >= 16) { + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint16_t surrogates_bitmask = static_cast(_mm_movemask_epi8(surrogates_bytemask)); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x0000) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + __m128i inlow = __lsx_vilvl_b(in8, zero); + __m128i inhigh = __lsx_vilvh_b(in8, zero); + __lsx_vst(inlow, reinterpret_cast(utf16_output), 0); + __lsx_vst(inhigh, reinterpret_cast(utf16_output), 16); + utf16_output += 16; + buf += 16; + } - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + return std::make_pair(buf, utf16_output); +} +/* end file src/lsx/lsx_convert_latin1_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/lsx/lsx_convert_latin1_to_utf32.cpp */ +std::pair +lsx_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + const char *end = buf + len; + + while (end - buf >= 16) { + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); + + __m128i zero = __lsx_vldi(0); + __m128i in16low = __lsx_vilvl_b(zero, in8); + __m128i in16high = __lsx_vilvh_b(zero, in8); + __m128i in32_0 = __lsx_vilvl_h(zero, in16low); + __m128i in32_1 = __lsx_vilvh_h(zero, in16low); + __m128i in32_2 = __lsx_vilvl_h(zero, in16high); + __m128i in32_3 = __lsx_vilvh_h(zero, in16high); + + __lsx_vst(in32_0, reinterpret_cast(utf32_output), 0); + __lsx_vst(in32_1, reinterpret_cast(utf32_output + 4), 0); + __lsx_vst(in32_2, reinterpret_cast(utf32_output + 8), 0); + __lsx_vst(in32_3, reinterpret_cast(utf32_output + 12), 0); + + utf32_output += 16; + buf += 16; + } - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + return std::make_pair(buf, utf32_output); +} +/* end file src/lsx/lsx_convert_latin1_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/* begin file src/lsx/lsx_convert_utf8_to_utf16.cpp */ +// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 16, usually 12). 
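// Illustrative scalar sketch (not part of the vendored simdutf sources): the
// two interleave variants above differ only in operand order, which decides
// whether the zero byte lands first (big-endian) or second (little-endian) in
// memory. Scalar equivalent, assuming a little-endian host:
static inline char16_t latin1_to_utf16_unit_sketch(unsigned char b, bool to_big_endian) {
  const char16_t v = char16_t(b);              // 0x00bb in native order
  return to_big_endian ? char16_t(v << 8) : v; // byte-swapped for BE output
}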
+template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) { + // We process in chunks of 16 bytes + // The routine in simd.h is reused. + simd8 temp{in}; + temp.store_ascii_as_utf16(utf16_output); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m128i t0 = _mm_shuffle_epi8(in, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000)); - - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m128i s0 = _mm_srli_epi16(in, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); - const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m128i s4 = _mm_xor_si128(s3, m0); -#undef simdutf_vec + uint64_t buffer[2]; + // 3 byte sequences are the next most common, as seen in CJK, which has long + // sequences of these. + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. + __m128i composed = convert_utf8_3_byte_to_utf16(in); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } - // 4. expand code units 16-bit => 32-bit - const __m128i out0 = _mm_unpacklo_epi16(t2, s4); - const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 4; // We wrote 4 16-bit characters. + return 12; // We consumed 12 bytes. + } - // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16_t mask = (one_byte_bitmask & 0x5555) | - (one_or_two_bytes_bitmask & 0xaaaa); - if(mask == 0) { - // We only have three-byte code units. Use fast path. - const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - } - const uint8_t mask0 = uint8_t(mask); + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xAAAA) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte + // UTF-16 code units. + __m128i composed = convert_utf8_2_byte_to_utf16(in); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 6; // We wrote 6 16-bit characters. + return 12; // We consumed 12 bytes. + } - const uint8_t mask1 = static_cast(mask >> 8); + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fallback. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + const __m128i zero = __lsx_vldi(0); + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + __m128i composed = convert_utf8_1_to_2_byte_to_utf16(in, idx); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + // Store + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 6; // We wrote 6 16-bit characters. + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // XXX: depending on the system scalar instructions might be faster. 
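// Illustrative scalar sketch (not part of the vendored simdutf sources): the
// mask-and-shift sequence above is the plain two-byte decode
// 110aaaaa 10bbbbbb -> 00000aaa aabbbbbb, applied to all lanes at once.
static inline char16_t utf8_2_byte_decode_sketch(unsigned char b0, unsigned char b1) {
  // assumes b0/b1 are already validated as a well-formed two-byte sequence
  return char16_t(((b0 & 0x1F) << 6) | (b1 & 0x3F));
}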
+ // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // 1 byte: 00000000 0ccccccc + // 2 byte: xx0bbbbb x0cccccc + // 3 byte: xxbbbbbb x0cccccc + __m128i lowperm = __lsx_vpickev_h(perm, perm); + // 1 byte: 00000000 00000000 + // 2 byte: 00000000 00000000 + // 3 byte: 00000000 1110aaaa + __m128i highperm = __lsx_vpickod_h(perm, perm); + // 3 byte: aaaa0000 00000000 + highperm = __lsx_vslli_h(highperm, 12); + // ASCII + // 1 byte: 00000000 0ccccccc + // 2+byte: 00000000 00cccccc + __m128i ascii = __lsx_vand_v(lowperm, __lsx_vrepli_h(0x7f)); + // 1 byte: 00000000 00000000 + // 2 byte: xx0bbbbb 00000000 + // 3 byte: xxbbbbbb 00000000 + __m128i middlebyte = __lsx_vand_v(lowperm, lsx_splat_u16(0xFF00)); + // 1 byte: 00000000 0ccccccc + // 2 byte: 0010bbbb bbcccccc + // 3 byte: 0010bbbb bbcccccc + __m128i composed = __lsx_vor_v(__lsx_vsrli_h(middlebyte, 2), ascii); - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; + __m128i v0fff = __lsx_vreplgr2vr_h(uint16_t(0xfff)); + // aaaabbbb bbcccccc + composed = __lsx_vbitsel_v(highperm, composed, v0fff); - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word & 0xFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xF800 ) != 0xD800) { - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf8_output); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - } + if simdutf_constexpr (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 4; // We wrote 4 16-bit codepoints + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but + // it is easier when we can assume they are all pairs. This version does + // not use the LUT, but 4 byte sequences are less common and the overhead + // of the extra memory access is less important than the early branch + // overhead in shorter sequences. 
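// Illustrative scalar sketch (not part of the vendored simdutf sources): the
// expected_mask/expected comparison just below checks, for each 4-byte unit,
// that the lead byte is 11110xxx and the three continuations are 10xxxxxx.
static inline bool well_formed_4_byte_shape_sketch(const unsigned char b[4]) {
  return (b[0] & 0xF8) == 0xF0 && (b[1] & 0xC0) == 0x80 &&
         (b[2] & 0xC0) == 0x80 && (b[3] & 0xC0) == 0x80;
}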
+
+    __m128i expected_mask =
+        (__m128i)v16u8{0xf8, 0xc0, 0xc0, 0xc0, 0xf8, 0xc0, 0xc0, 0xc0,
+                       0xf8, 0xc0, 0xc0, 0xc0, 0x0,  0x0,  0x0,  0x0};
+    __m128i expected =
+        (__m128i)v16u8{0xf0, 0x80, 0x80, 0x80, 0xf0, 0x80, 0x80, 0x80,
+                       0xf0, 0x80, 0x80, 0x80, 0x0,  0x0,  0x0,  0x0};
+    __m128i check = __lsx_vseq_b(__lsx_vand_v(in, expected_mask), expected);
+    if (__lsx_bz_b(check))
+      return 12;
+    // Swap byte pairs
+    // 10dddddd 10cccccc|10bbbbbb 11110aaa
+    // 10cccccc 10dddddd|11110aaa 10bbbbbb
+    __m128i swap = lsx_swap_bytes(in);
+    // Shift left 2 bits
+    // cccccc00 dddddd00 xxxxxxxx bbbbbb00
+    __m128i shift = __lsx_vslli_b(swap, 2);
+    // Create a magic number containing the low 2 bits of the trail surrogate
+    // and all the corrections needed to create the pair.
+    // UTF-8 4b prefix   = -0x0000|0xF000
+    // surrogate offset  = -0x0000|0x0040 (0x10000 << 6)
+    // surrogate high    = +0x0000|0xD800
+    // surrogate low     = +0xDC00|0x0000
+    // -------------------------------
+    //                   = +0xDC00|0xE7C0
+    __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0));
+    // Generate unadjusted trail surrogate minus lowest 2 bits
+    // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
+    __m128i trail = __lsx_vbitsel_v(shift, swap, lsx_splat_u32(0x0000ff00));
+    // Insert low 2 bits of trail surrogate to magic number for later
+    // 11011100 00000000 11100111 110000cc
+    __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic);
+
+    // Generate lead surrogate
+    // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
+    // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
+    __m128i lead = __lsx_vbitsel_v(
+        __lsx_vsrli_h(__lsx_vand_v(shift, __lsx_vldi(0x3F)), 4), swap,
+        __lsx_vrepli_h(0x3f /* 0x003f*/));
+
+    // Blend pairs
+    // 000000cc ccdddddd|11110aaa bbbbbb00
+    __m128i blend = __lsx_vbitsel_v(lead, trail, lsx_splat_u32(0x0000FFFF));
+
+    // Add magic number to finish the result
+    // 110111CC CCDDDDDD|110110AA BBBBBBCC
+    __m128i composed = __lsx_vadd_h(blend, magic_with_low_2);
+    // Byte swap if necessary
+    if simdutf_constexpr (!match_system(big_endian)) {
+      composed = lsx_swap_bytes(composed);
+    }
+    // __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lsx_vst(composed, reinterpret_cast<uint16_t *>(buffer), 0);
+    std::memcpy(utf16_output, buffer, 12);
+    utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
+    return 12;         // We consumed 12 bytes.
+  }
+  // 3 1-4 byte sequences
+  __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                             simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                         0);
+  // 1 byte: 00000000 00000000 00000000 0ddddddd
+  // 2 byte: 00000000 00000000 110ccccc 10dddddd
+  // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+  // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+  sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+  __m128i perm = __lsx_vshuf_b(zero, in, sh);
+  // added to fix issue https://github.com/simdutf/simdutf/issues/514
+  // We only want to write 2 * 16-bit code units when that is actually what we
+  // have. Unfortunately, we cannot trust the input. So it is possible to get
+  // 0xff as an input byte and it should not result in a surrogate pair. We
+  // need to check for that.
+  uint32_t permbuffer[4];
+  __lsx_vst(perm, permbuffer, 0);
+  // Mask the low and middle bytes
+  // 00000000 00000000 00000000 0ddddddd
+  __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7f));
+  // Because the surrogates need more work, the high surrogate is computed
+  // first.
+  __m128i middlehigh = __lsx_vslli_w(perm, 2);
+  // 00000000 00000000 00cccccc 00000000
+  __m128i middlebyte = __lsx_vand_v(perm, lsx_splat_u32(0x00003F00));
+  // Start assembling the sequence. 
Since the 4th byte is in the same position + // as it would be in a surrogate and there is no dependency, shift left + // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: + // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx + __m128i ab = __lsx_vbitsel_v(middlehigh, perm, lsx_splat_u32(0xFF000000)); + // Top 16 bits contains the high ten bits of the surrogate pair before + // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa + // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction + __m128i v_fffc0000 = __lsx_vreplgr2vr_w(uint32_t(0xFFFC0000)); + __m128i abc = __lsx_vbitsel_v(__lsx_vslli_w(middlebyte, 4), ab, v_fffc0000); + // Combine the low 6 or 7 bits by a shift right accumulate + // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct + // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o + // correction + __m128i composed = __lsx_vor_v(ascii, __lsx_vsrli_w(abc, 6)); + // After this is for surrogates + // Blend the low and high surrogates + // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd + __m128i mixed = __lsx_vbitsel_v(abc, composed, lsx_splat_u32(0x0000FFFF)); + // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits + // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte: + // 11110aaa bbbbbbcc|000000cc ccdddddd + __m128i v_ffff03ff = __lsx_vreplgr2vr_w(uint32_t(0xFFFF03FF)); + __m128i masked_pair = __lsx_vand_v(mixed, v_ffff03ff); + // Correct the remaining UTF-8 prefix, surrogate offset, and add the + // surrogate prefixes in one magic 16-bit addition. similar magic number but + // without the continue byte adjust and halfword swapped UTF-8 4b prefix = + // -0xF000|0x0000 surrogate offset = -0x0040|0x0000 (0x10000 << 6) + // surrogate high = +0xD800|0x0000 + // surrogate low = +0x0000|0xDC00 + // ----------------------------------- + // = +0xE7C0|0xDC00 + __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xE7C0DC00)); + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete + __m128i surrogates = __lsx_vadd_w(masked_pair, magic); + // If the high bit is 1 (s32 less than zero), this needs a surrogate pair + __m128i is_pair = __lsx_vslt_w(perm, zero); + // Select either the 4 byte surrogate pair or the 2 byte solo codepoint + // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD + __m128i selected = __lsx_vbitsel_v(composed, surrogates, is_pair); + // Byte swap if necessary + if simdutf_constexpr (!match_system(big_endian)) { + selected = lsx_swap_bytes(selected); + } + // Attempting to shuffle and store would be complex, just scalarize. + uint32_t buffer_tmp[4]; + __lsx_vst(selected, buffer_tmp, 0); + // Test for the top bit of the surrogate mask. Remove due to issue 514 + // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : + // 0x00800000; + for (size_t i = 0; i < 3; i++) { + // Surrogate + // Used to be if (buffer[i] & SURROGATE_MASK) { + // See discussion above. 
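// Illustrative scalar sketch (not part of the vendored simdutf sources): the
// loop below emits a surrogate pair only when the unit's top byte really is a
// four-byte lead, i.e. (word & 0xF8000000) == 0xF0000000, instead of trusting
// the computed surrogate mask (see simdutf issue 514 referenced above).
static inline bool is_4_byte_lead_sketch(unsigned char byte) {
  return (byte & 0xF8) == 0xF0; // 11110xxx
}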
+ // patch for issue https://github.com/simdutf/simdutf/issues/514 + if ((permbuffer[i] & 0xf8000000) == 0xf0000000) { + utf16_output[0] = uint16_t(buffer_tmp[i] >> 16); + utf16_output[1] = uint16_t(buffer_tmp[i] & 0xFFFF); + utf16_output += 2; + } else { + utf16_output[0] = uint16_t(buffer_tmp[i] & 0xFFFF); + utf16_output++; } - buf += k; } - } // while - - return std::make_pair(buf, utf8_output); + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; + } } +/* end file src/lsx/lsx_convert_utf8_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/lsx/lsx_convert_utf8_to_utf32.cpp */ +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_out) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint32_t *&utf32_output = reinterpret_cast(utf32_out); + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { + // We process in chunks of 16 bytes. + // use fast implementation in src/simdutf/arm64/simd.h + // Ideally the compiler can keep the tables in registers. + simd8 temp{in}; + temp.store_ascii_as_utf32_tbl(utf32_out); + utf32_output += 16; // We wrote 16 32-bit characters. + return 16; // We consumed 16 bytes. + } + __m128i zero = __lsx_vldi(0); + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte + // UTF-32 code units. Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in); + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + utf32_output += 4; // We wrote 4 32-bit characters. + return 12; // We consumed 12 bytes. + } + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if (input_utf8_end_of_code_point_mask == 0xaaa) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte + // UTF-32 code units. Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in); -/* - Returns a pair: a result struct and utf8_output. - If there is an error, the count field of the result is the position of the error. - Otherwise, it is the position of the first unprocessed byte in buf (even if finished). - A scalar routing should carry on the conversion of the tail if needed. 
-*/ -template -std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) { - const char16_t* start = buf; - const char16_t* end = buf + len; + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); + __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16); - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + __lsx_vst(utf32_high, reinterpret_cast(utf32_output), 16); + utf32_output += 6; + return 12; // We consumed 12 bytes. + } + /// Either no fast path or an unimportant fast path. - while (buf + 16 + safety_margin <= end) { - __m128i in = _mm_loadu_si128((__m128i*)buf); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); - } - // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes - const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); - if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! - __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - nextin = _mm_shuffle_epi8(nextin, swap); - } - if(!_mm_testz_si128(nextin, v_ff80)) { - // 1. pack the bytes - // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in,in); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - in = nextin; - } else { - // 1. pack the bytes - // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in,nextin); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } - } + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; - // no bits set above 7th bit - const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); - const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx); + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); + __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16); - // no bits set above 11th bit - const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); - const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + __lsx_vst(utf32_high, reinterpret_cast(utf32_output), 16); + utf32_output += 6; + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. 
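// Illustrative scalar sketch (not part of the vendored simdutf sources): the
// shuffle/mask/accumulate steps below assemble, per lane,
// 1110aaaa 10bbbbbb 10cccccc -> aaaabbbb bbcccccc, the plain 3-byte decode:
static inline char32_t utf8_3_byte_decode_sketch(unsigned char b0, unsigned char b1,
                                                 unsigned char b2) {
  // assumes a validated, well-formed three-byte sequence
  return char32_t(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
}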
+ __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // Shuffle + // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // Split + // 00000000 00000000 0ccccccc + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits + // Note: unmasked + // xxxxxxxx aaaaxxxx xxxxxxxx + __m128i high = + __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits + // Use 16 bit bic instead of and. + // The top bits will be corrected later in the bsl + // 00000000 10bbbbbb 00000000 + __m128i middle = + __lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits + // Combine low and middle with shift right accumulate + // 00000000 00xxbbbb bbcccccc + __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2)); + // Insert top 4 bits from high byte with bitwise select + // 00000000 aaaabbbb bbcccccc + __m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000)); + __lsx_vst(composed, utf32_output, 0); + utf32_output += 4; // We wrote 4 32-bit characters. + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-32 code units. This uses the same method as the fixed 3 byte + // version, reversing and shift left insert. However, there is no need for + // a shuffle mask now, just rev16 and rev32. + // + // This version does not use the LUT, but 4 byte sequences are less common + // and the overhead of the extra memory access is less important than the + // early branch overhead in shorter sequences, so it comes last. - if (one_or_two_bytes_bitmask == 0xffff) { - internal::westmere::write_v_u16_11bits_to_utf8(in, utf8_output, one_byte_bytemask, one_byte_bitmask); - buf += 8; - continue; + // Swap pairs of bytes + // 10dddddd|10cccccc|10bbbbbb|11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + __m128i swap = lsx_swap_bytes(in); + // Shift left and insert + // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb + __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap, + __lsx_vrepli_h(0x3f /*0x003F*/)); + // Shift insert again + // xxxxxxxx xxxaaabb bbbbcccc ccdddddd + __m128i merge2 = + __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */ + __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */ + lsx_splat_u32(0x00000FFF)); + // Clear the garbage + // 00000000 000aaabb bbbbcccc ccdddddd + __m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF)); + // Store + __lsx_vst(composed, utf32_output, 0); + utf32_output += 3; // We wrote 3 32-bit characters. + return 12; // We consumed 12 bytes. } + // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit + // due to surrogates no longer being involved. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 2 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. 
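// A scalar sketch of the 4-byte decoding that the 0x888 path above
// vectorizes: 0b100010001000 marks three 4-byte code points ending at bytes
// 3, 7 and 11. The helper name is illustrative, not a simdutf function.
#include <cstdint>
static inline uint32_t decode_utf8_4(const unsigned char *b) {
  // 11110aaa 10bbbbbb 10cccccc 10dddddd -> 000aaabb bbbbcccc ccdddddd
  return (uint32_t(b[0] & 0x07) << 18) | (uint32_t(b[1] & 0x3F) << 12) |
         (uint32_t(b[2] & 0x3F) << 6) | uint32_t(b[3] & 0x3F);
}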
- const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); - - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint16_t surrogates_bitmask = static_cast(_mm_movemask_epi8(surrogates_bytemask)); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x0000) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. - - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. - - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. - - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m128i t0 = _mm_shuffle_epi8(in, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000)); - - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m128i s0 = _mm_srli_epi16(in, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); - const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m128i s4 = _mm_xor_si128(s3, m0); -#undef simdutf_vec + // Ascii + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); + __m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00)); + // 00000000 00000000 0000cccc ccdddddd + __m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii); - // 4. 
expand code units 16-bit => 32-bit
-      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+    __m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000));
+    __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1));
+    // Insert twice
+    // 00000000 000aaabb bbbbxxxx xxxxxxxx
+    __m128i corrected_srli2 =
+        __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2);
+    __m128i ab =
+        __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f));
+    ab = __lsx_vsrli_w(ab, 4);
+    // 00000000 000aaabb bbbbcccc ccdddddd
+    __m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF));
+    // Store
+    __lsx_vst(composed, utf32_output, 0);
+    utf32_output += 3; // We wrote 3 32-bit characters.
+    return consumed;
+  } else {
+    // here we know that there is an error but we do not handle errors
+    return 12;
+  }
+}
+/* end file src/lsx/lsx_convert_utf8_to_utf32.cpp */
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
+/* begin file src/lsx/lsx_convert_utf8_to_latin1.cpp */
+size_t convert_masked_utf8_to_latin1(const char *input,
+                                     uint64_t utf8_end_of_code_point_mask,
+                                     char *&latin1_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
-      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16_t mask = (one_byte_bitmask & 0x5555) |
-                            (one_or_two_bytes_bitmask & 0xaaaa);
-      if(mask == 0) {
-        // We only have three-byte code units. Use fast path.
-        const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
-        continue;
-      }
-      const uint8_t mask0 = uint8_t(mask);
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+  // We first try a few fast paths.
+  // The obvious first test is ASCII, which actually consumes the full 16.
+  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
+    // We process in chunks of 16 bytes
+    __lsx_vst(in, reinterpret_cast<uint8_t *>(latin1_output), 0);
+    latin1_output += 16; // We wrote 16 8-bit characters.
+    return 16;           // We consumed 16 bytes.
+  }
+  /// We do not have a fast path available, or the fast path is unimportant, so
+  /// we fall back.
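// A scalar sketch of what the shuffle-based path below computes, assuming
// already-validated input whose code points all fit in Latin-1 (that is,
// only 1-byte sequences and 2-byte sequences with lead byte 0xC2 or 0xC3).
// The helper name is illustrative, not a simdutf function.
#include <cstddef>
#include <cstdint>
static inline size_t utf8_to_latin1_scalar(const unsigned char *src,
                                           size_t len, char *dst) {
  size_t written = 0;
  for (size_t i = 0; i < len;) {
    if (src[i] < 0x80) { // 1 byte: 0xxxxxxx
      dst[written++] = char(src[i++]);
    } else { // 2 bytes: 110000xx 10yyyyyy => xxyyyyyy
      dst[written++] = char(((src[i] & 0x03) << 6) | (src[i + 1] & 0x3F));
      i += 2;
    }
  }
  return written;
}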
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];
+  // this indicates an invalid input:
+  if (idx >= 64) {
+    return consumed;
+  }
+  // Here we should have (idx < 64); if not, there is a bug in the validation
+  // or elsewhere. This is a relatively easy scenario: we process SIX (6)
+  // input code units. The maximum length in bytes of six code units spanning
+  // between 1 and 2 bytes each is 12 bytes, so we convert 6 one-to-two-byte
+  // UTF-8 code units into 6 Latin-1 characters.
+  __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                             simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                         0);
+  // Shuffle
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 110aaaaa 10bbbbbb
+  sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
+  // ascii mask
+  // 1 byte: 11111111 11111111
+  // 2 byte: 00000000 00000000
+  __m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80));
+  // utf8 mask
+  // 1 byte: 00000000 00000000
+  // 2 byte: 00111111 00111111
+  __m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm),
+                                   __lsx_vldi(0b00111111));
+  // mask
+  // 1 byte: 11111111 11111111
+  // 2 byte: 00111111 00111111
+  __m128i mask = __lsx_vor_v(utf8_mask, ascii_mask);
+
+  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask);
+  // writing 8 bytes even though we only care about the first 6 bytes.
+  __m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed);
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+  uint64_t buffer[2];
+  // __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+  __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(buffer), 0);
+  std::memcpy(latin1_output, buffer, 6);
+  latin1_output += 6; // We wrote 6 bytes.
+  return consumed;
+}
+/* end file src/lsx/lsx_convert_utf8_to_latin1.cpp */
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
+#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
+/* begin file src/lsx/lsx_convert_utf16_to_latin1.cpp */
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+lsx_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_output) {
+  const char16_t *end = buf + len;
+  while (end - buf >= 16) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+    if simdutf_constexpr (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+      in1 = lsx_swap_bytes(in1);
+    }
+    if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
+      // 1. pack the bytes
+      __m128i latin1_packed = __lsx_vpickev_b(in1, in);
+      // 2. store (16 bytes)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3.
adjust pointers + buf += 16; + latin1_output += 16; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} - buf += 8; - // surrogate pair(s) in a register +template +std::pair +lsx_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + while (end - buf >= 16) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); + if simdutf_constexpr (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + in1 = lsx_swap_bytes(in1); + } + if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vpickev_b(in1, in); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 16; + latin1_output += 16; } else { // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word & 0xFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xF800 ) != 0xD800) { - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + for (int k = 0; k < 16; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if (word <= 0xff) { + *latin1_output++ = char(word); } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); } } - buf += k; } } // while - - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); } -/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */ -/* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */ +/* end file src/lsx/lsx_convert_utf16_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8 +/* begin file src/lsx/lsx_convert_utf16_to_utf8.cpp */ /* The vectorized algorithm works on single SSE register i.e., it loads eight 16-bit code units. @@ -34748,731 +60589,1145 @@ std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* b array of 32-bit values. The array spans two SSE registers. The bytes from the registers are compressed using two shuffles. 
- We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair +lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char16_t *end = buf + len; + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7ff)); + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + if simdutf_constexpr (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + } + if (__lsx_bz_v( + __lsx_vslt_hu(__lsx_vrepli_h(0x7F), in))) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII + // characters. + __m128i nextin = __lsx_vld(reinterpret_cast(buf), 16); + if simdutf_constexpr (!match_system(big_endian)) { + nextin = lsx_swap_bytes(nextin); + } + if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), nextin))) { + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(nextin, in); + // 2. store (16 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } else { + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(in, in); + // 2. store (8 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } + } + + __m128i zero = __lsx_vldi(0); + if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, in))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + // t0 = [000a|aaaa|bbbb|bb00] + __m128i t0 = __lsx_vslli_h(in, 2); + // t1 = [000a|aaaa|0000|0000] + __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); + // t2 = [0000|0000|00bb|bbbb] + __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + __m128i t3 = __lsx_vor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xc080)); + __m128i t4 = __lsx_vor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m128i one_byte_bytemask = + __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F /*0x007F*/)); + __m128i utf8_unpacked = __lsx_vbitsel_v(t4, in, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0); + // 4. pack the bytes + const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lsx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle = __lsx_vld(row, 1); + __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); + // 5. store bytes + __lsx_vst(utf8_packed, utf8_output, 0); + // 6. 
adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } + __m128i surrogates_bytemask = __lsx_vseq_h( + __lsx_vand_v(in, lsx_splat_u16(0xf800)), lsx_splat_u16(0xd800)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (__lsx_bz_v(surrogates_bytemask)) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -/* - Returns a pair: the first unprocessed byte from buf and utf8_output - A scalar routing should carry on the conversion of the tail. -*/ -template -std::pair sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) { - const char16_t* end = buf + len; + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. - const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m128i t0 = __lsx_vpickev_b(in, in); + t0 = __lsx_vilvl_b(t0, t0); - while (buf + 8 <= end) { - __m128i in = _mm_loadu_si128((__m128i*)buf); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc] + __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); + __m128i t1 = __lsx_vand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m128i s0 = __lsx_vsrli_h(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m128i s1 = __lsx_vslli_h(in, 2); + // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3f00)); + + // [00bb|bbbb|0000|aaaa] + __m128i s2 = __lsx_vor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); + __m128i s3 = __lsx_vor_v(s2, v_c0e0); + __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); + __m128i s4 = __lsx_vxor_v(s3, m0); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); - } + // 4. expand code units 16-bit => 32-bit + __m128i out0 = __lsx_vilvl_h(s4, t2); + __m128i out1 = __lsx_vilvh_h(s4, t2); - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. 
- const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m128i one_byte_bytemask = __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F)); + + __m128i one_or_two_bytes_bytemask_low = + __lsx_vilvl_h(one_or_two_bytes_bytemask, zero); + __m128i one_or_two_bytes_bytemask_high = + __lsx_vilvh_h(one_or_two_bytes_bytemask, zero); + + __m128i one_byte_bytemask_low = + __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask); + __m128i one_byte_bytemask_high = + __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask); + + const uint32_t mask0 = __lsx_vpickve2gr_bu( + __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_low, + one_byte_bytemask_low)), + 0); + const uint32_t mask1 = __lsx_vpickve2gr_bu( + __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_high, + one_byte_bytemask_high)), + 0); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0); + + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1); + + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint16_t surrogates_bitmask = static_cast(_mm_movemask_epi8(surrogates_bytemask)); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x0000) { - // case: no surrogate pair, extend 16-bit code units to 32-bit code units - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8))); - utf32_output += 8; - buf += 8; - // surrogate pair(s) in a register + buf += 8; + // surrogate pair(s) in a register } else { // Let us do a scalar fallback. // It may seem wasteful to use scalar code, but being efficient with SIMD // in the presence of surrogate pairs may require non-trivial tables. size_t forward = 15; size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word &0xF800 ) != 0xD800) { - *utf32_output++ = char32_t(word); + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); } else { // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? 
scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); k++; uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf32_output); } + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); } } buf += k; } } // while - return std::make_pair(buf, utf32_output); + return std::make_pair(buf, reinterpret_cast(utf8_output)); } - /* Returns a pair: a result struct and utf8_output. - If there is an error, the count field of the result is the position of the error. - Otherwise, it is the position of the first unprocessed byte in buf (even if finished). - A scalar routing should carry on the conversion of the tail if needed. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. */ template -std::pair sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) { - const char16_t* start = buf; - const char16_t* end = buf + len; - - const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); - - while (buf + 8 <= end) { - __m128i in = _mm_loadu_si128((__m128i*)buf); - - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); - } - - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. - const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); - - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint16_t surrogates_bitmask = static_cast(_mm_movemask_epi8(surrogates_bytemask)); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x0000) { - // case: no surrogate pair, extend 16-bit code units to 32-bit code units - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8))); - utf32_output += 8; - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word &0xF800 ) != 0xD800) { - *utf32_output++ = char32_t(word); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? 
scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - } - } - buf += k; - } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); -} -/* end file src/westmere/sse_convert_utf16_to_utf32.cpp */ - -/* begin file src/westmere/sse_convert_utf32_to_latin1.cpp */ -std::pair -sse_convert_utf32_to_latin1(const char32_t *buf, size_t len, - char *latin1_output) { - const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - - __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); - __m128i shufmask = - _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); - - for (size_t i = 0; i < rounded_len; i += 16) { - __m128i in1 = _mm_loadu_si128((__m128i *)buf); - __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4)); - __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8)); - __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12)); - - __m128i check_combined = _mm_or_si128(in1, in2); - check_combined = _mm_or_si128(check_combined, in3); - check_combined = _mm_or_si128(check_combined, in4); - - if (!_mm_testz_si128(check_combined, high_bytes_mask)) { - return std::make_pair(nullptr, latin1_output); - } - __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), _mm_shuffle_epi8(in2, shufmask)); - __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), _mm_shuffle_epi8(in4, shufmask)); - __m128i pack = _mm_unpacklo_epi64(pack1, pack2); - _mm_storeu_si128((__m128i *)latin1_output, pack); - latin1_output += 16; - buf += 16; - } - - return std::make_pair(buf, latin1_output); -} - std::pair -sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, - char *latin1_output) { - const char32_t *start = buf; - const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - - __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); - __m128i shufmask = - _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); - - for (size_t i = 0; i < rounded_len; i += 16) { - __m128i in1 = _mm_loadu_si128((__m128i *)buf); - __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4)); - __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8)); - __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12)); - - __m128i check_combined = _mm_or_si128(in1, in2); - check_combined = _mm_or_si128(check_combined, in3); - check_combined = _mm_or_si128(check_combined, in4); +lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char16_t *start = buf; + const char16_t *end = buf + len; - if (!_mm_testz_si128(check_combined, high_bytes_mask)) { - // Fallback to scalar code for handling errors - for (int k = 0; k < 16; k++) { - char32_t codepoint = buf[k]; - if (codepoint <= 0xff) { - *latin1_output++ = char(codepoint); - } else { - return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), - latin1_output); - } + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + if simdutf_constexpr (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + } + if (__lsx_bz_v( + 
__lsx_vslt_hu(__lsx_vrepli_h(0x7F), in))) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII + // characters. + __m128i nextin = __lsx_vld(reinterpret_cast(buf), 16); + if simdutf_constexpr (!match_system(big_endian)) { + nextin = lsx_swap_bytes(nextin); } - buf += 16; - continue; - } - __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), _mm_shuffle_epi8(in2, shufmask)); - __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), _mm_shuffle_epi8(in4, shufmask)); - __m128i pack = _mm_unpacklo_epi64(pack1, pack2); - _mm_storeu_si128((__m128i *)latin1_output, pack); - latin1_output += 16; - buf += 16; - } - - return std::make_pair(result(error_code::SUCCESS, buf - start), - latin1_output); -} -/* end file src/westmere/sse_convert_utf32_to_latin1.cpp */ -/* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */ -std::pair sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) { - const char32_t* end = buf + len; - - const __m128i v_0000 = _mm_setzero_si128();//__m128 = 128 bits - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); //1111 1000 0000 0000 - const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); //1100 0000 1000 0000 - const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); //1111 1111 1000 0000 - const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); //1111 1111 1111 1111 0000 0000 0000 0000 - const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); //0111 1111 1111 1111 1111 1111 1111 1111 - __m128i running_max = _mm_setzero_si128(); - __m128i forbidden_bytemask = _mm_setzero_si128(); - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 - - while (buf + 16 + safety_margin <= end) { //buf is a char32_t pointer, each char32_t has 4 bytes or 32 bits, thus buf + 16 * char_32t = 512 bits = 64 bytes - // We load two 16 bytes registers for a total of 32 bytes or 16 characters. - __m128i in = _mm_loadu_si128((__m128i*)buf); - __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);//These two values can hold only 8 UTF32 chars - running_max = _mm_max_epu32( - _mm_max_epu32(in, running_max), //take element-wise max char32_t from in and running_max vector - nextin); //and take element-wise max element from nextin and running_max vector - - // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation - __m128i in_16 = _mm_packus_epi32( - _mm_and_si128(in, v_7fffffff), - _mm_and_si128(nextin, v_7fffffff) - );//in this context pack the two __m128 into a single - //By ensuring the highest bit is set to 0(&v_7fffffff), we're making sure all values are interpreted as non-negative, or specifically, the values are within the range of valid Unicode code points. - //remember : having leading byte 0 means a positive number by the two complements system. Unicode is well beneath the range where you'll start getting issues so that's OK. - - // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp - - // Check for ASCII fast path - - // ASCII fast path!!!! - // We eagerly load another 32 bytes, hoping that they will be ASCII too. - // The intuition is that we try to collect 16 ASCII characters which requires - // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin - // as our new inputs. 
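// The eager ASCII strategy above amounts to this scalar test: OR all lanes
// together and check that no bit above the low seven survives, so 16 UTF-32
// values (64 input bytes) can be emitted as 16 ASCII bytes in one step.
// (Illustrative helper, assuming nothing beyond the code shown above.)
#include <cstddef>
#include <cstdint>
static inline bool all_ascii_utf32(const uint32_t *p, size_t n) {
  uint32_t bits = 0;
  for (size_t i = 0; i < n; i++) {
    bits |= p[i];
  }
  return (bits & 0xFFFFFF80u) == 0; // no bits above the 7 ASCII bits
}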
- if(_mm_testz_si128(in_16, v_ff80)) { //if the first two blocks are ASCII - __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2); - __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3); - running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin);//take the running max of all 4 vectors thus far - __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));//pack into 1 vector, now you have two - if(!_mm_testz_si128(nextin_16, v_ff80)) { //checks if the second packed vector is ASCII, if not: + if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), nextin))) { // 1. pack the bytes // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16); //creates two copy of in_16 in 1 vector - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); //put them into the output - // 3. adjust pointers - buf += 8; //the char32_t buffer pointer goes up 8 char32_t chars* 32 bits = 256 bits - utf8_output += 8; //same with output, e.g. lift the first two blocks alone. - // Proceed with next input - in_16 = nextin_16; - // We need to update in and nextin because they are used later. - in = thirdin; - nextin = fourthin; - } else { - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16); + __m128i utf8_packed = __lsx_vpickev_b(nextin, in); // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + __lsx_vst(utf8_packed, utf8_output, 0); // 3. adjust pointers buf += 16; utf8_output += 16; continue; // we are done for this round! + } else { + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(in, in); + // 2. store (8 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; } } - // no bits set above 7th bit -- find out all the ASCII characters - const __m128i one_byte_bytemask = _mm_cmpeq_epi16( // this takes four bytes at a time and compares: - _mm_and_si128(in_16, v_ff80), // the vector that get only the first 9 bits of each 16-bit/2-byte units - v_0000 // - ); // they should be all zero if they are ASCII. E.g. ASCII in UTF32 is of format 0000 0000 0000 0XXX XXXX - // _mm_cmpeq_epi16 should now return a 1111 1111 1111 1111 for equals, and 0000 0000 0000 0000 if not for each 16-bit/2-byte units - const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); // collect the MSB from previous vector and put them into uint16_t mas - - // no bits set above 11th bit - const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); - const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); - - if (one_or_two_bytes_bitmask == 0xffff) { - // case: all code units either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes) + __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7ff)); + __m128i zero = __lsx_vldi(0); + if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, in))) { // 1. 
prepare 2-byte values // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000 - const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111 - // t0 = [000a|aaaa|bbbb|bb00] - const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two + __m128i t0 = __lsx_vslli_h(in, 2); // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = _mm_and_si128(t0, v_1f00); // potential first utf8 byte + __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] - const __m128i t2 = _mm_and_si128(in_16, v_003f);// potential second utf8 byte + __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] - const __m128i t3 = _mm_or_si128(t1, t2); // first and second potential utf8 byte together + __m128i t3 = __lsx_vor_v(t1, t2); // t4 = [110a|aaaa|10bb|bbbb] - const __m128i t4 = _mm_or_si128(t3, v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit - + __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xc080)); + __m128i t4 = __lsx_vor_v(t3, v_c080); // 2. merge ASCII and 2-byte codewords - const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask); - + __m128i one_byte_bytemask = + __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F /*0x007F*/)); + __m128i utf8_unpacked = __lsx_vbitsel_v(t4, in, one_byte_bytemask); // 3. prepare bitmask for 8-bit lookup - // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) - const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a - const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 - const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0); // 4. pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); - + const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lsx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle = __lsx_vld(row, 1); + __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - + __lsx_vst(utf8_packed, utf8_output, 0); // 6. adjust pointers buf += 8; utf8_output += row[0]; continue; } - - // Check for overflow in packing - - const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffff) { + __m128i surrogates_bytemask = __lsx_vseq_h( + __lsx_vand_v(in, lsx_splat_u16(0xf800)), lsx_splat_u16(0xd800)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (__lsx_bz_v(surrogates_bytemask)) { // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); - forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800)); - - const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. 
[0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). + */ /** * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: * t2 => [0ccc|cccc] [10cc|cccc] * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) */ -#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000)); + __m128i t0 = __lsx_vpickev_b(in, in); + t0 = __lsx_vilvl_b(t0, t0); - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m128i s0 = _mm_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); - const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m128i s4 = _mm_xor_si128(s3, m0); -#undef simdutf_vec + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc] + __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); + __m128i t1 = __lsx_vand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m128i s0 = __lsx_vsrli_h(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m128i s1 = __lsx_vslli_h(in, 2); + // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3f00)); + + // [00bb|bbbb|0000|aaaa] + __m128i s2 = __lsx_vor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m128i v_c0e0 = 
__lsx_vreplgr2vr_h(uint16_t(0xC0E0)); + __m128i s3 = __lsx_vor_v(s2, v_c0e0); + __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); + __m128i s4 = __lsx_vxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit - const __m128i out0 = _mm_unpacklo_epi16(t2, s4); - const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + __m128i out0 = __lsx_vilvl_h(s4, t2); + __m128i out1 = __lsx_vilvh_h(s4, t2); // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16_t mask = (one_byte_bitmask & 0x5555) | - (one_or_two_bytes_bitmask & 0xaaaa); - if(mask == 0) { - // We only have three-byte code units. Use fast path. - const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - } - const uint8_t mask0 = uint8_t(mask); - - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); - - const uint8_t mask1 = static_cast(mask >> 8); - - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); - - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + __m128i one_byte_bytemask = __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F)); + + __m128i one_or_two_bytes_bytemask_low = + __lsx_vilvl_h(one_or_two_bytes_bytemask, zero); + __m128i one_or_two_bytes_bytemask_high = + __lsx_vilvh_h(one_or_two_bytes_bytemask, zero); + + __m128i one_byte_bytemask_low = + __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask); + __m128i one_byte_bytemask_high = + __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask); + + const uint32_t mask0 = __lsx_vpickve2gr_bu( + __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_low, + one_byte_bytemask_low)), + 0); + const uint32_t mask1 = __lsx_vpickve2gr_bu( + __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_high, + one_byte_bytemask_high)), + 0); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0); + + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1); + + __lsx_vst(utf8_0, utf8_output, 0); utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + __lsx_vst(utf8_1, utf8_output, 0); utf8_output += row1[0]; buf += 8; + // surrogate pair(s) in a register } else { - // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes // Let us do a scalar fallback. // It may seem wasteful to use scalar code, but being efficient with SIMD // in the presence of surrogate pairs may require non-trivial tables. 
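      // Worked example of the surrogate arithmetic used in the scalar
      // fallback below: U+1F600 is stored in UTF-16 as the pair 0xD83D 0xDE00.
      //   diff  = 0xD83D - 0xD800 = 0x03D
      //   diff2 = 0xDE00 - 0xDC00 = 0x200
      //   value = (0x03D << 10) + 0x200 + 0x10000 = 0x1F600
      // The (diff | diff2) > 0x3FF test rejects anything that is not a high
      // surrogate followed by a low surrogate, since valid offsets are 10-bit.
      // (Illustrative check, not part of the vendored sources.)
      static_assert(((0xD83Du - 0xD800u) << 10) + (0xDE00u - 0xDC00u) +
                            0x10000u ==
                        0x1F600u,
                    "surrogate pair reassembly");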
size_t forward = 15; size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xFF80) == 0) { *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xFFFF0000 )==0) { - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); } else { - if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + reinterpret_cast(utf8_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); } } buf += k; } } // while - // check for invalid input - const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff); - if(static_cast(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) { - return std::make_pair(nullptr, utf8_output); - } - - if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); } + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); +} +/* end file src/lsx/lsx_convert_utf16_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/lsx/lsx_convert_utf16_to_utf32.cpp */ +template +std::pair +lsx_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + const char16_t *end = buf + len; - return std::make_pair(buf, utf8_output); + __m128i zero = __lsx_vldi(0); + __m128i v_f800 = lsx_splat_u16(0xf800); + __m128i v_d800 = lsx_splat_u16(0xd800); + + while (end - buf >= 8) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + if simdutf_constexpr (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + } + + __m128i surrogates_bytemask = + __lsx_vseq_h(__lsx_vand_v(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. 
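// The vector test above is the SIMD form of this scalar predicate: a 16-bit
// code unit is a surrogate (high or low) exactly when its top five bits are
// 11011, i.e. when it falls in 0xD800..0xDFFF. (Illustrative helper.)
#include <cstdint>
static inline bool is_surrogate(uint16_t w) {
  return (w & 0xF800) == 0xD800;
}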
+ if (__lsx_bz_v(surrogates_bytemask)) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code + // units + __lsx_vst(__lsx_vilvl_h(zero, in), utf32_output, 0); + __lsx_vst(__lsx_vilvh_h(zero, in), utf32_output, 16); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(buf, reinterpret_cast(utf32_output)); } +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ +template +std::pair +lsx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + const char16_t *start = buf; + const char16_t *end = buf + len; -std::pair sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) { + __m128i zero = __lsx_vldi(0); + __m128i v_f800 = lsx_splat_u16(0xf800); + __m128i v_d800 = lsx_splat_u16(0xd800); - const char32_t* end = buf + len; - const char32_t* start = buf; + while (end - buf >= 8) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + if simdutf_constexpr (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + } - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); - const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); - const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); - const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); - const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); - const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff); + __m128i surrogates_bytemask = + __lsx_vseq_h(__lsx_vand_v(in, v_f800), v_d800); + if (__lsx_bz_v(surrogates_bytemask)) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code + // units + __lsx_vst(__lsx_vilvl_h(zero, in), utf32_output, 0); + __lsx_vst(__lsx_vilvh_h(zero, in), utf32_output, 16); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
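// Why the fallback below walks at most 15 of the 16 code units: a high
// surrogate may sit in the last slot with its low surrogate outside the
// window, so one unit is held back and re-examined on the next iteration.
// A sketch of the clamping, with an illustrative name (assumes remaining >= 1,
// which the enclosing loop guarantees):
#include <cstddef>
static inline size_t fallback_window(size_t remaining) {
  size_t forward = 15;
  if (remaining < forward + 1) {
    forward = remaining - 1; // never read past the buffer
  }
  return forward;
}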
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = scalar::utf16::swap_if_needed(buf[k]); + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + scalar::utf16::swap_if_needed(buf[k + 1]); + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf32_output)); +} +/* end file src/lsx/lsx_convert_utf16_to_utf32.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +/* begin file src/lsx/lsx_convert_utf32_to_latin1.cpp */ +std::pair +lsx_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; + __m128i v_ff = __lsx_vrepli_w(0xFF); - while (buf + 16 + safety_margin <= end) { - // We load two 16 bytes registers for a total of 32 bytes or 16 characters. - __m128i in = _mm_loadu_si128((__m128i*)buf); - __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); + while (end - buf >= 16) { + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in2 = __lsx_vld(reinterpret_cast(buf), 16); - // Check for too large input - __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff); - if(static_cast(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) { - return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); + __m128i in12 = __lsx_vor_v(in1, in2); + if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); } + } // while + return std::make_pair(buf, latin1_output); +} - // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation - __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff)); +std::pair +lsx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; - // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp + const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; + __m128i v_ff = __lsx_vrepli_w(0xFF); - // Check for ASCII fast path - if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!! - // We eagerly load another 32 bytes, hoping that they will be ASCII too. - // The intuition is that we try to collect 16 ASCII characters which requires - // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin - // as our new inputs. 
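The lsx_convert_utf32_to_latin1 routines above rest on a simple fact: UTF-32 to Latin-1 is plain truncation once every code point is known to be at most 0xFF, which the kernel checks with one unsigned compare before keeping byte 0 of each 32-bit lane (the {0, 4, 8, 12, ...} shuffle). A scalar sketch, with an invented name, for illustration only:

#include <cstddef>
#include <cstdint>

// Converts UTF-32 to Latin-1; returns the position of the first offending
// code point, or `len` on success. Anything above 0xFF has no Latin-1
// encoding (error_code::TOO_LARGE in the library's terms).
size_t utf32_to_latin1(const uint32_t *src, size_t len, char *dst) {
  for (size_t i = 0; i < len; i++) {
    if (src[i] > 0xFF) {
      return i;
    }
    dst[i] = char(src[i]); // the low byte is the Latin-1 value
  }
  return len;
}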
- __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2); - __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3); - __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff)); - if(!_mm_testz_si128(nextin_16, v_ff80)) { - // 1. pack the bytes - // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - // Proceed with next input - in_16 = nextin_16; - __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff); - if(static_cast(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) { - return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); + while (end - buf >= 16) { + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in2 = __lsx_vld(reinterpret_cast(buf), 16); + + __m128i in12 = __lsx_vor_v(in1, in2); + + if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + // Let us do a scalar fallback. + for (int k = 0; k < 8; k++) { + uint32_t word = buf[k]; + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); } - // We need to update in and nextin because they are used later. - in = thirdin; - nextin = fourthin; - } else { - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! } } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} +/* end file src/lsx/lsx_convert_utf32_to_latin1.cpp */ +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +/* begin file src/lsx/lsx_convert_utf32_to_utf8.cpp */ +std::pair +lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char32_t *end = buf + len; - // no bits set above 7th bit - const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000); - const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); - - // no bits set above 11th bit - const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); - const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + __m128i v_c080 = lsx_splat_u16(0xc080); + __m128i v_07ff = lsx_splat_u16(0x07ff); + __m128i v_dfff = lsx_splat_u16(0xdfff); + __m128i v_d800 = lsx_splat_u16(0xd800); + __m128i forbidden_bytemask = __lsx_vldi(0x0); - if (one_or_two_bytes_bitmask == 0xffff) { - // case: all code units either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes) - // 1. 
prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); - const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 - // t0 = [000a|aaaa|bbbb|bb00] - const __m128i t0 = _mm_slli_epi16(in_16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = _mm_and_si128(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m128i t2 = _mm_and_si128(in_16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m128i t3 = _mm_or_si128(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m128i t4 = _mm_or_si128(t3, v_c080); + while (end - buf > std::ptrdiff_t(16 + safety_margin)) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i nextin = __lsx_vld(reinterpret_cast(buf), 16); - // 2. merge ASCII and 2-byte codewords - const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask); + // Check if no bits set above 16th + if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp) + __m128i utf16_packed = __lsx_vpickev_h(nextin, in); - // 3. prepare bitmask for 8-bit lookup - // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) - const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a - const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 - const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea - // 4. pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); + if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), + utf16_packed))) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed); + // 2. store (8 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } + __m128i zero = __lsx_vldi(0); + if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 - // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = __lsx_vslli_h(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = __lsx_vor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = __lsx_vor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m128i one_byte_bytemask = + __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/)); + __m128i utf8_unpacked = + __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + uint32_t m2 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0); + // 4. 
pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lsx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle = __lsx_vld(row, 1); + __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); + // 5. store bytes + __lsx_vst(utf8_packed, utf8_output, 0); - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; - } + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + forbidden_bytemask = __lsx_vor_v( + __lsx_vand_v( + __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single + UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three + UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed); + t0 = __lsx_vilvl_b(t0, t0); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); + __m128i t1 = __lsx_vand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m128i s0 = __lsx_vsrli_h(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m128i s1 = __lsx_vslli_h(utf16_packed, 2); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3F00)); + // [00bb|bbbb|0000|aaaa] + __m128i s2 = __lsx_vor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); + __m128i s3 = __lsx_vor_v(s2, v_c0e0); + __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); + __m128i s4 = __lsx_vxor_v(s3, m0); - // Check for overflow in packing - const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); + // 4. expand code units 16-bit => 32-bit + __m128i out0 = __lsx_vilvl_h(s4, t2); + __m128i out1 = __lsx_vilvh_h(s4, t2); - if (saturation_bitmask == 0xffff) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m128i one_byte_bytemask = + __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F)); + + __m128i one_or_two_bytes_bytemask_u16_to_u32_low = + __lsx_vilvl_h(one_or_two_bytes_bytemask, zero); + __m128i one_or_two_bytes_bytemask_u16_to_u32_high = + __lsx_vilvh_h(one_or_two_bytes_bytemask, zero); + + __m128i one_byte_bytemask_u16_to_u32_low = + __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask); + __m128i one_byte_bytemask_u16_to_u32_high = + __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask); + + const uint32_t mask0 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( + one_or_two_bytes_bytemask_u16_to_u32_low, + one_byte_bytemask_u16_to_u32_low)), + 0); + const uint32_t mask1 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( + one_or_two_bytes_bytemask_u16_to_u32_high, + one_byte_bytemask_u16_to_u32_high)), + 0); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0); + + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1); + + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; - // Check for illegal surrogate code units - const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); - const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800); - if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output); + buf += 8; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } } + buf += k; + } + } // while - const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + // check for invalid input + if (__lsx_bnz_v(forbidden_bytemask)) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + } - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. 
[0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + return std::make_pair(buf, reinterpret_cast(utf8_output)); +} - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. +std::pair +lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char32_t *start = buf; + const char32_t *end = buf + len; - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + __m128i v_c080 = lsx_splat_u16(0xc080); + __m128i v_07ff = lsx_splat_u16(0x07ff); + __m128i v_dfff = lsx_splat_u16(0xdfff); + __m128i v_d800 = lsx_splat_u16(0xd800); + __m128i forbidden_bytemask = __lsx_vldi(0x0); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + while (end - buf > std::ptrdiff_t(16 + safety_margin)) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i nextin = __lsx_vld(reinterpret_cast(buf), 16); - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000)); + // Check if no bits set above 16th + if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp) + __m128i utf16_packed = __lsx_vpickev_h(nextin, in); - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m128i s0 = _mm_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); - const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m128i s4 = _mm_xor_si128(s3, m0); -#undef simdutf_vec + if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), + utf16_packed))) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed); + // 2. store (8 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } + __m128i zero = __lsx_vldi(0); + if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) { + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 - // 4. expand code units 16-bit => 32-bit - const __m128i out0 = _mm_unpacklo_epi16(t2, s4); - const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = __lsx_vslli_h(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = __lsx_vor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = __lsx_vor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m128i one_byte_bytemask = + __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/)); + __m128i utf8_unpacked = + __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + uint32_t m2 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lsx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle = __lsx_vld(row, 1); + __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); + // 5. store bytes + __lsx_vst(utf8_packed, utf8_output, 0); - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16_t mask = (one_byte_bitmask & 0x5555) | - (one_or_two_bytes_bitmask & 0xaaaa); - if(mask == 0) { - // We only have three-byte code units. Use fast path. - const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += 12; + // 6. adjust pointers buf += 8; + utf8_output += row[0]; continue; - } - const uint8_t mask0 = uint8_t(mask); - - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + forbidden_bytemask = __lsx_vor_v( + __lsx_vand_v( + __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + if (__lsx_bnz_v(forbidden_bytemask)) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf8_output)); + } + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single + UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three + UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. 
+ + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed); + t0 = __lsx_vilvl_b(t0, t0); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); + __m128i t1 = __lsx_vand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); - const uint8_t mask1 = static_cast(mask >> 8); + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m128i s0 = __lsx_vsrli_h(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m128i s1 = __lsx_vslli_h(utf16_packed, 2); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3F00)); + // [00bb|bbbb|0000|aaaa] + __m128i s2 = __lsx_vor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); + __m128i s3 = __lsx_vor_v(s2, v_c0e0); + // __m128i v_07ff = vmovq_n_u16((uint16_t)0x07FF); + __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); + __m128i s4 = __lsx_vxor_v(s3, m0); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + // 4. expand code units 16-bit => 32-bit + __m128i out0 = __lsx_vilvl_h(s4, t2); + __m128i out1 = __lsx_vilvh_h(s4, t2); - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m128i one_byte_bytemask = + __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F)); + + __m128i one_or_two_bytes_bytemask_u16_to_u32_low = + __lsx_vilvl_h(one_or_two_bytes_bytemask, zero); + __m128i one_or_two_bytes_bytemask_u16_to_u32_high = + __lsx_vilvh_h(one_or_two_bytes_bytemask, zero); + + __m128i one_byte_bytemask_u16_to_u32_low = + __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask); + __m128i one_byte_bytemask_u16_to_u32_high = + __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask); + + const uint32_t mask0 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( + one_or_two_bytes_bytemask_u16_to_u32_low, + one_byte_bytemask_u16_to_u32_low)), + 0); + const uint32_t mask1 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( + one_or_two_bytes_bytemask_u16_to_u32_high, + one_byte_bytemask_u16_to_u32_high)), + 0); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0); + + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1); + + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; - buf += 8; + buf += 8; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. 
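The comment just above states the invariant that lets these kernels route between the packed UTF-16 path and the four-byte scalar path: a code point needs a UTF-16 surrogate pair exactly when it needs four UTF-8 bytes, namely when it is at least 0x10000. A small illustration (not vendored code):

#include <cstdint>

// UTF-8 length classes for a valid code point (cp <= 0x10FFFF).
inline int utf8_length(uint32_t cp) {
  if (cp < 0x80) return 1;    // ASCII
  if (cp < 0x800) return 2;   // two-byte sequence
  if (cp < 0x10000) return 3; // three-byte sequence
  return 4;                   // four-byte sequence
}
inline int utf16_length(uint32_t cp) { return cp < 0x10000 ? 1 : 2; }
// For every valid cp: (utf8_length(cp) == 4) == (utf16_length(cp) == 2).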
} else { - // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes // Let us do a scalar fallback. // It may seem wasteful to use scalar code, but being efficient with SIMD // in the presence of surrogate pairs may require non-trivial tables. size_t forward = 15; size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { + if ((word & 0xFFFFFF80) == 0) { *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xFFFF0000 )==0) { - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); } else { - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); } } @@ -35480,61 +61735,72 @@ std::pair sse_convert_utf32_to_utf8_with_errors(const char32_t* b } } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); } -/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */ -/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */ +/* end file src/lsx/lsx_convert_utf32_to_utf8.cpp */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +/* begin file src/lsx/lsx_convert_utf32_to_utf16.cpp */ template -std::pair sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { - - const char32_t* end = buf + len; - - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); - __m128i forbidden_bytemask = _mm_setzero_si128(); +std::pair +lsx_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); + const char32_t *end = buf + len; - while (buf + 8 <= end) { - __m128i in = _mm_loadu_si128((__m128i*)buf); - __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); - const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), 
v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); + __m128i forbidden_bytemask = __lsx_vrepli_h(0); + __m128i v_d800 = lsx_splat_u16(0xd800); + __m128i v_dfff = lsx_splat_u16(0xdfff); + while (end - buf >= 8) { + __m128i in0 = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); // Check if no bits set above 16th - if (saturation_bitmask == 0xffff) { - // Pack UTF-32 to UTF-16 - __m128i utf16_packed = _mm_packus_epi32(in, nextin); - - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); - forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800)); - - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + if (__lsx_bz_v(__lsx_vpickod_h(in1, in0))) { + __m128i utf16_packed = __lsx_vpickev_h(in1, in0); + forbidden_bytemask = __lsx_vor_v( + __lsx_vand_v( + __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + + if simdutf_constexpr (!match_system(big_endian)) { + utf16_packed = lsx_swap_bytes(utf16_packed); } - - _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + __lsx_vst(utf16_packed, utf16_output, 0); utf16_output += 8; buf += 8; } else { - size_t forward = 7; + size_t forward = 3; size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { + if ((word & 0xFFFF0000) == 0) { // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } - *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); } else { // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } word -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); } *utf16_output++ = char16_t(high_surrogate); *utf16_output++ = char16_t(low_surrogate); @@ -35545,65 +61811,80 @@ std::pair sse_convert_utf32_to_utf16(const char32_t* } // check for invalid input - if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } - - return std::make_pair(buf, utf16_output); + if (__lsx_bnz_v(forbidden_bytemask)) { + return std::make_pair(nullptr, reinterpret_cast(utf16_output)); + } + return std::make_pair(buf, reinterpret_cast(utf16_output)); } - template -std::pair sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { - const char32_t* start = buf; - const char32_t* end = buf + len; - - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); +std::pair +lsx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); + const char32_t *start = buf; + const char32_t *end = buf + len; - while (buf + 8 <= end) { - __m128i in = _mm_loadu_si128((__m128i*)buf); - __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); - const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); + __m128i forbidden_bytemask = __lsx_vrepli_h(0); + __m128i v_d800 = lsx_splat_u16(0xd800); + __m128i v_dfff = lsx_splat_u16(0xdfff); + while (end - buf >= 8) { + __m128i in0 = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); // Check if no bits set above 16th - if (saturation_bitmask == 0xffff) { - // Pack UTF-32 to UTF-16 - __m128i utf16_packed = _mm_packus_epi32(in, nextin); - - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); - const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800); - if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); + if (__lsx_bz_v(__lsx_vpickod_h(in1, in0))) { + __m128i utf16_packed = __lsx_vpickev_h(in1, in0); + + forbidden_bytemask = __lsx_vor_v( + __lsx_vand_v( + __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + if (__lsx_bnz_v(forbidden_bytemask)) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf16_output)); } - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = 
_mm_shuffle_epi8(utf16_packed, swap); + if simdutf_constexpr (!match_system(big_endian)) { + utf16_packed = lsx_swap_bytes(utf16_packed); } - _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + __lsx_vst(utf16_packed, utf16_output, 0); utf16_output += 8; buf += 8; } else { - size_t forward = 7; + size_t forward = 3; size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { + if ((word & 0xFFFF0000) == 0) { // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } - *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? char16_t(word >> 8 | word << 8) + : char16_t(word); } else { // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf16_output)); + } word -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); } *utf16_output++ = char16_t(high_surrogate); *utf16_output++ = char16_t(low_surrogate); @@ -35613,138 +61894,147 @@ std::pair sse_convert_utf32_to_utf16_with_errors(const char32 } } - return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf16_output)); } -/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */ -/* begin file src/westmere/sse_base64.cpp */ +/* end file src/lsx/lsx_convert_utf32_to_utf16.cpp */ +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_BASE64 +/* begin file src/lsx/lsx_base64.cpp */ /** * References and further reading: * * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. - * https://arxiv.org/abs/1910.05109 - * - * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 - * Instructions, ACM Transactions on the Web 12 (3), 2018. - * https://arxiv.org/abs/1704.00605 - * - * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. - * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, - * Request for Comments: 4648. - * - * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. - * http://www.alfredklomp.com/programming/sse-base64/. (2014). - * - * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD - * acceleration. https://github.com/aklomp/base64. (2014). - * - * Hanson Char. 2014. 
A Fast and Correct Base 64 Codec. (2014). - * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ - * - * Nick Kopp. 2013. Base64 Encoding on a GPU. - * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). - */ -template __m128i lookup_pshufb_improved(const __m128i input) { - // credit: Wojciech Muła - // reduce 0..51 -> 0 - // 52..61 -> 1 .. 10 - // 62 -> 11 - // 63 -> 12 - __m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51)); - - // distinguish between ranges 0..25 and 26..51: - // 0 .. 25 -> remains 0 - // 26 .. 51 -> becomes 13 - const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input); - result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13))); - - __m128i shift_LUT; - if (base64_url) { - shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); - } else { - shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); - } - - // read shift - result = _mm_shuffle_epi8(shift_LUT, result); - - return _mm_add_epi8(result, input); -} + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ -template -size_t encode_base64(char *dst, const char *src, size_t srclen) { +template +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { // credit: Wojciech Muła // SSE (lookup: pshufb improved unrolled) const uint8_t *input = (const uint8_t *)src; - + static const char *lookup_tbl = + isbase64url + ? 
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; uint8_t *out = (uint8_t *)dst; - const __m128i shuf = - _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); + + v16u8 shuf; + __m128i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1, + base64_tbl2, base64_tbl3; + if (srclen >= 16) { + shuf = v16u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10}; + v_fc0fc00 = __lsx_vreplgr2vr_w(uint32_t(0x0fc0fc00)); + v_3f03f0 = __lsx_vreplgr2vr_w(uint32_t(0x003f03f0)); + shift_r = __lsx_vreplgr2vr_w(uint32_t(0x0006000a)); + shift_l = __lsx_vreplgr2vr_w(uint32_t(0x00080004)); + base64_tbl0 = __lsx_vld(lookup_tbl, 0); + base64_tbl1 = __lsx_vld(lookup_tbl, 16); + base64_tbl2 = __lsx_vld(lookup_tbl, 32); + base64_tbl3 = __lsx_vld(lookup_tbl, 48); + } size_t i = 0; for (; i + 52 <= srclen; i += 48) { - __m128i in0 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 0)); - __m128i in1 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 1)); - __m128i in2 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 2)); - __m128i in3 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 3)); - - in0 = _mm_shuffle_epi8(in0, shuf); - in1 = _mm_shuffle_epi8(in1, shuf); - in2 = _mm_shuffle_epi8(in2, shuf); - in3 = _mm_shuffle_epi8(in3, shuf); - - const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00)); - const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00)); - const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00)); - const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00)); - - const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040)); - const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040)); - const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040)); - const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040)); - - const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0)); - const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0)); - const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0)); - const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0)); - - const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010)); - const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010)); - const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010)); - const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010)); - - const __m128i input0 = _mm_or_si128(t1_0, t3_0); - const __m128i input1 = _mm_or_si128(t1_1, t3_1); - const __m128i input2 = _mm_or_si128(t1_2, t3_2); - const __m128i input3 = _mm_or_si128(t1_3, t3_3); - - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), - lookup_pshufb_improved(input0)); + __m128i in0 = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 0); + __m128i in1 = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 1); + __m128i in2 = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 2); + __m128i in3 = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 3); + + in0 = __lsx_vshuf_b(in0, in0, (__m128i)shuf); + in1 = __lsx_vshuf_b(in1, in1, (__m128i)shuf); + in2 = __lsx_vshuf_b(in2, in2, (__m128i)shuf); + in3 = __lsx_vshuf_b(in3, in3, (__m128i)shuf); + + __m128i t0_0 = __lsx_vand_v(in0, v_fc0fc00); + __m128i t0_1 = __lsx_vand_v(in1, v_fc0fc00); + __m128i t0_2 = __lsx_vand_v(in2, v_fc0fc00); + __m128i t0_3 = __lsx_vand_v(in3, v_fc0fc00); + + __m128i t1_0 = __lsx_vsrl_h(t0_0, shift_r); + __m128i 
t1_1 = __lsx_vsrl_h(t0_1, shift_r); + __m128i t1_2 = __lsx_vsrl_h(t0_2, shift_r); + __m128i t1_3 = __lsx_vsrl_h(t0_3, shift_r); + + __m128i t2_0 = __lsx_vand_v(in0, v_3f03f0); + __m128i t2_1 = __lsx_vand_v(in1, v_3f03f0); + __m128i t2_2 = __lsx_vand_v(in2, v_3f03f0); + __m128i t2_3 = __lsx_vand_v(in3, v_3f03f0); + + __m128i t3_0 = __lsx_vsll_h(t2_0, shift_l); + __m128i t3_1 = __lsx_vsll_h(t2_1, shift_l); + __m128i t3_2 = __lsx_vsll_h(t2_2, shift_l); + __m128i t3_3 = __lsx_vsll_h(t2_3, shift_l); + + __m128i input0 = __lsx_vor_v(t1_0, t3_0); + __m128i input0_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input0); + __m128i input0_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2, + __lsx_vsub_b(input0, __lsx_vldi(32))); + __m128i input0_mask = __lsx_vslei_bu(input0, 31); + __m128i input0_result = + __lsx_vbitsel_v(input0_shuf1, input0_shuf0, input0_mask); + __lsx_vst(input0_result, reinterpret_cast<__m128i *>(out), 0); out += 16; - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), - lookup_pshufb_improved(input1)); + __m128i input1 = __lsx_vor_v(t1_1, t3_1); + __m128i input1_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input1); + __m128i input1_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2, + __lsx_vsub_b(input1, __lsx_vldi(32))); + __m128i input1_mask = __lsx_vslei_bu(input1, 31); + __m128i input1_result = + __lsx_vbitsel_v(input1_shuf1, input1_shuf0, input1_mask); + __lsx_vst(input1_result, reinterpret_cast<__m128i *>(out), 0); out += 16; - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), - lookup_pshufb_improved(input2)); + __m128i input2 = __lsx_vor_v(t1_2, t3_2); + __m128i input2_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input2); + __m128i input2_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2, + __lsx_vsub_b(input2, __lsx_vldi(32))); + __m128i input2_mask = __lsx_vslei_bu(input2, 31); + __m128i input2_result = + __lsx_vbitsel_v(input2_shuf1, input2_shuf0, input2_mask); + __lsx_vst(input2_result, reinterpret_cast<__m128i *>(out), 0); out += 16; - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), - lookup_pshufb_improved(input3)); + __m128i input3 = __lsx_vor_v(t1_3, t3_3); + __m128i input3_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input3); + __m128i input3_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2, + __lsx_vsub_b(input3, __lsx_vldi(32))); + __m128i input3_mask = __lsx_vslei_bu(input3, 31); + __m128i input3_result = + __lsx_vbitsel_v(input3_shuf1, input3_shuf0, input3_mask); + __lsx_vst(input3_result, reinterpret_cast<__m128i *>(out), 0); out += 16; } for (; i + 16 <= srclen; i += 12) { - __m128i in = _mm_loadu_si128(reinterpret_cast(input + i)); + __m128i in = __lsx_vld(reinterpret_cast(input + i), 0); // bytes from groups A, B and C are needed in separate 32-bit lanes // in = [DDDD|CCCC|BBBB|AAAA] @@ -35758,40 +62048,44 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) { // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^ // processed bits - in = _mm_shuffle_epi8(in, shuf); + in = __lsx_vshuf_b(in, in, (__m128i)shuf); // unpacking - // t0 = [0000cccc|cc000000|aaaaaa00|00000000] - const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00)); + __m128i t0 = __lsx_vand_v(in, v_fc0fc00); // t1 = [00000000|00cccccc|00000000|00aaaaaa] - // (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned - // multiplication) - const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); + // ((c >> 6), (a >> 10)) + __m128i t1 = __lsx_vsrl_h(t0, shift_r); // t2 = [00000000|00dddddd|000000bb|bbbb0000] - const __m128i t2 = 
_mm_and_si128(in, _mm_set1_epi32(0x003f03f0)); - // t3 = [00dddddd|00000000|00bbbbbb|00000000]( - // (d * (1 << 8), b * (1 << 4)) - const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); + __m128i t2 = __lsx_vand_v(in, v_3f03f0); + // t3 = [00dddddd|00000000|00bbbbbb|00000000] + // ((d << 8), (b << 4)) + __m128i t3 = __lsx_vsll_h(t2, shift_l); // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 - const __m128i indices = _mm_or_si128(t1, t3); + __m128i indices = __lsx_vor_v(t1, t3); + + __m128i indices_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, indices); + __m128i indices_shuf1 = __lsx_vshuf_b( + base64_tbl3, base64_tbl2, __lsx_vsub_b(indices, __lsx_vldi(32))); + __m128i indices_mask = __lsx_vslei_bu(indices, 31); + __m128i indices_result = + __lsx_vbitsel_v(indices_shuf1, indices_shuf0, indices_mask); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), - lookup_pshufb_improved(indices)); + __lsx_vst(indices_result, reinterpret_cast<__m128i *>(out), 0); out += 16; } return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options); } + static inline void compress(__m128i data, uint16_t mask, char *output) { if (mask == 0) { - _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data); + __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0); return; } - // this particular implementation was inspired by work done by @animetosho // we do it in two steps, first 8 bytes and then second 8 bytes uint8_t mask1 = uint8_t(mask); // least significant 8 bits @@ -35800,13 +62094,15 @@ static inline void compress(__m128i data, uint16_t mask, char *output) { // thintable_epi8[mask2] into a 128-bit register, using only // two instructions on most compilers. - __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2], - tables::base64::thintable_epi8[mask1]); + v2u64 shufmask = {tables::base64::thintable_epi8[mask1], + tables::base64::thintable_epi8[mask2]}; + // we increment by 0x08 the second half of the mask - shufmask = - _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + v4u32 hi = {0, 0, 0x08080808, 0x08080808}; + __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi); + // this is the version "nearly pruned" - __m128i pruned = _mm_shuffle_epi8(data, shufmask); + __m128i pruned = __lsx_vshuf_b(data, data, shufmask1); // we still need to put the two halves together. // we compute the popcount of the first half: int pop1 = tables::base64::BitsSetTable256mul2[mask1]; @@ -35814,178 +62110,284 @@ static inline void compress(__m128i data, uint16_t mask, char *output) { // only the first pop1 bytes from the first 8 bytes, and then // it fills in with the bytes from the second 8 bytes + some filling // at the end. 
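To make the two-shuffle construction above concrete: compress() keeps the bytes whose mask bit is clear, shifting the survivors left, and thintable_epi8 plus a popcount reproduce the following scalar loop 8 bytes at a time. A reference model (illustrative only):

#include <cstddef>
#include <cstdint>

// Writes the bytes of `data` whose mask bit is clear, in order, and
// returns how many bytes were kept. This is what the table-driven
// shuffles compute for each 8-byte half before the popcount joins them.
size_t compress_scalar(const uint8_t data[16], uint16_t mask, uint8_t *out) {
  size_t kept = 0;
  for (int i = 0; i < 16; i++) {
    if ((mask & (uint16_t(1) << i)) == 0) {
      out[kept++] = data[i];
    }
  }
  return kept;
}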
- __m128i compactmask = _mm_loadu_si128(reinterpret_cast( - tables::base64::pshufb_combine_table + pop1 * 8)); - __m128i answer = _mm_shuffle_epi8(pruned, compactmask); - _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); + __m128i compactmask = + __lsx_vld(reinterpret_cast( + tables::base64::pshufb_combine_table + pop1 * 8), + 0); + __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask); + + __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0); } struct block64 { __m128i chunks[4]; }; -template +template static inline uint16_t to_base64_mask(__m128i *src, bool *error) { - const __m128i ascii_space_tbl = - _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, - 0xc, 0xd, 0x0, 0x0); + const v16u8 ascii_space_tbl = {0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0}; // credit: aqrit - __m128i delta_asso; - if (base64_url) { - delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, - 0x0, 0x0, 0x0, 0xF, 0x0, 0xF); + /* + '0'(0x30)-'9'(0x39) => delta_values_index = 4 + 'A'(0x41)-'Z'(0x5a) => delta_values_index = 4/5/12(4+8) + 'a'(0x61)-'z'(0x7a) => delta_values_index = 6/7/14(6+8) + '+'(0x2b) => delta_values_index = 3 + '/'(0x2f) => delta_values_index = 2+8 = 10 + '-'(0x2d) => delta_values_index = 2+8 = 10 + '_'(0x5f) => delta_values_index = 5+8 = 13 + */ + v16u8 delta_asso; + if (default_or_url) { + delta_asso = v16u8{0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16}; } else { - - delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); + delta_asso = v16u8{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, + 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF}; } - __m128i delta_values; - if (base64_url) { - delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), - uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), - 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), - uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); + v16i8 delta_values; + if (default_or_url) { + delta_values = + v16i8{int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0xFF), int8_t(0x11), + int8_t(0xFF), int8_t(0xBF), int8_t(0x10), int8_t(0xB9)}; + } else if (base64_url) { + delta_values = + v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3), + int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)}; } else { - delta_values = - _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), - int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), - int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3), - int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)); - } - __m128i check_asso; - if (base64_url) { - check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, - 0x3, 0x7, 0xB, 0x6, 0xB, 0x12); + v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3), + int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)}; + } + + v16u8 check_asso; + if (default_or_url) { + check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06}; + } else if (base64_url) { + check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 
+ 0x01, 0x01, 0x03, 0x07, 0x0B, 0x06, 0x0B, 0x12}; } else { - - check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); + check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F}; } - __m128i check_values; - if (base64_url) { - check_values = _mm_setr_epi8(0x0, uint8_t(0x80), uint8_t(0x80), - uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), - uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5), - uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80), - uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0); - } else { + v16i8 check_values; + if (default_or_url) { + check_values = + v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), + int8_t(0xB5), int8_t(0xA1), int8_t(0x00), int8_t(0x80), + int8_t(0x00), int8_t(0x80), int8_t(0x00), int8_t(0x80)}; + } else if (base64_url) { + check_values = v16i8{int8_t(0x0), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6), + int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80), + int8_t(0xB0), int8_t(0x80), int8_t(0x0), int8_t(0x0)}; + } else { check_values = - _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), - int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), - int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80), - int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)); + v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), + int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80), + int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)}; } - const __m128i shifted = _mm_srli_epi32(*src, 3); + const __m128i shifted = __lsx_vsrli_b(*src, 3); + __m128i asso_index = __lsx_vand_v(*src, __lsx_vldi(0xF)); const __m128i delta_hash = - _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted); + __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)delta_asso, (__m128i)delta_asso, + (__m128i)asso_index), + shifted); const __m128i check_hash = - _mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted); + __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)check_asso, (__m128i)check_asso, + (__m128i)asso_index), + shifted); const __m128i out = - _mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src); + __lsx_vsadd_b(__lsx_vshuf_b((__m128i)delta_values, (__m128i)delta_values, + (__m128i)delta_hash), + *src); const __m128i chk = - _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src); - const int mask = _mm_movemask_epi8(chk); + __lsx_vsadd_b(__lsx_vshuf_b((__m128i)check_values, (__m128i)check_values, + (__m128i)check_hash), + *src); + unsigned int mask = __lsx_vpickve2gr_hu(__lsx_vmskltz_b(chk), 0); if (mask) { - __m128i ascii_space = - _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src); - *error |= (mask != _mm_movemask_epi8(ascii_space)); + __m128i ascii_space = __lsx_vseq_b(__lsx_vshuf_b((__m128i)ascii_space_tbl, + (__m128i)ascii_space_tbl, + (__m128i)asso_index), + *src); + *error |= + (mask != __lsx_vpickve2gr_hu(__lsx_vmskltz_b((__m128i)ascii_space), 0)); } + *src = out; return (uint16_t)mask; } -template +template static inline uint64_t to_base64_mask(block64 *b, bool *error) { *error = 0; - uint64_t m0 = to_base64_mask(&b->chunks[0], error); - uint64_t m1 = to_base64_mask(&b->chunks[1], error); - uint64_t m2 = to_base64_mask(&b->chunks[2], error); - uint64_t m3 = to_base64_mask(&b->chunks[3], error); + uint64_t m0 = + to_base64_mask(&b->chunks[0], 
error); + uint64_t m1 = + to_base64_mask(&b->chunks[1], error); + uint64_t m2 = + to_base64_mask(&b->chunks[2], error); + uint64_t m3 = + to_base64_mask(&b->chunks[3], error); return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); } static inline void copy_block(block64 *b, char *output) { - _mm_storeu_si128(reinterpret_cast<__m128i *>(output), b->chunks[0]); - _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16), b->chunks[1]); - _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 32), b->chunks[2]); - _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 48), b->chunks[3]); + __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output), 0); + __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output), 16); + __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output), 32); + __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output), 48); } static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { uint64_t nmask = ~mask; + uint64_t count = + __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0); + uint16_t *count_ptr = (uint16_t *)&count; compress(b->chunks[0], uint16_t(mask), output); - compress(b->chunks[1], uint16_t(mask >> 16), - output + _mm_popcnt_u64(nmask & 0xFFFF)); + compress(b->chunks[1], uint16_t(mask >> 16), output + count_ptr[0]); compress(b->chunks[2], uint16_t(mask >> 32), - output + _mm_popcnt_u64(nmask & 0xFFFFFFFF)); + output + count_ptr[0] + count_ptr[1]); compress(b->chunks[3], uint16_t(mask >> 48), - output + _mm_popcnt_u64(nmask & 0xFFFFFFFFFFFFULL)); - return _mm_popcnt_u64(nmask); -} - -// The caller of this function is responsible to ensure that there are 64 bytes available -// from reading at src. The data is read into a block64 structure. + output + count_ptr[0] + count_ptr[1] + count_ptr[2]); + return count_ones(nmask); +} + +template bool is_power_of_two(T x) { return (x & (x - 1)) == 0; } + +inline size_t compress_block_single(block64 *b, uint64_t mask, char *output) { + const size_t pos64 = trailing_zeroes(mask); + const int8_t pos = pos64 & 0xf; + // Predefine the index vector + const v16u8 v1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + switch (pos64 >> 4) { + case 0b00: { + const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1)); + const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1); // v1 > v0 + const __m128i sh = __lsx_vsub_b((__m128i)v1, v2); + const __m128i compressed = __lsx_vshuf_b(b->chunks[0], b->chunks[0], sh); + __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 0 * 16), 0); + __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output + 1 * 16 - 1), + 0); + __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output + 2 * 16 - 1), + 0); + __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output + 3 * 16 - 1), + 0); + } break; + + case 0b01: { + __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output + 0 * 16), 0); + + const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1)); + const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1); + const __m128i sh = __lsx_vsub_b((__m128i)v1, v2); + const __m128i compressed = __lsx_vshuf_b(b->chunks[1], b->chunks[1], sh); + + __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 1 * 16), 0); + __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output + 2 * 16 - 1), + 0); + __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output + 3 * 16 - 1), + 0); + } break; + + case 0b10: { + __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output + 0 * 16), 0); + __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output + 1 * 16), 0); + + const 
__m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1)); + const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1); + const __m128i sh = __lsx_vsub_b((__m128i)v1, v2); + const __m128i compressed = __lsx_vshuf_b(b->chunks[2], b->chunks[2], sh); + + __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 2 * 16), 0); + __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output + 3 * 16 - 1), + 0); + } break; + + case 0b11: { + __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output + 0 * 16), 0); + __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output + 1 * 16), 0); + __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output + 2 * 16), 0); + + const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1)); + const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1); + const __m128i sh = __lsx_vsub_b((__m128i)v1, v2); + const __m128i compressed = __lsx_vshuf_b(b->chunks[3], b->chunks[3], sh); + + __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 3 * 16), 0); + } break; + } + return 63; +} + +// The caller of this function is responsible to ensure that there are 64 bytes +// available from reading at src. The data is read into a block64 structure. static inline void load_block(block64 *b, const char *src) { - b->chunks[0] = _mm_loadu_si128(reinterpret_cast(src)); - b->chunks[1] = _mm_loadu_si128(reinterpret_cast(src + 16)); - b->chunks[2] = _mm_loadu_si128(reinterpret_cast(src + 32)); - b->chunks[3] = _mm_loadu_si128(reinterpret_cast(src + 48)); + b->chunks[0] = __lsx_vld(reinterpret_cast(src), 0); + b->chunks[1] = __lsx_vld(reinterpret_cast(src), 16); + b->chunks[2] = __lsx_vld(reinterpret_cast(src), 32); + b->chunks[3] = __lsx_vld(reinterpret_cast(src), 48); } -// The caller of this function is responsible to ensure that there are 128 bytes available -// from reading at src. The data is read into a block64 structure. +// The caller of this function is responsible to ensure that there are 128 bytes +// available from reading at src. The data is read into a block64 structure. 
static inline void load_block(block64 *b, const char16_t *src) { - __m128i m1 = _mm_loadu_si128(reinterpret_cast(src)); - __m128i m2 = _mm_loadu_si128(reinterpret_cast(src + 8)); - __m128i m3 = _mm_loadu_si128(reinterpret_cast(src + 16)); - __m128i m4 = _mm_loadu_si128(reinterpret_cast(src + 24)); - __m128i m5 = _mm_loadu_si128(reinterpret_cast(src + 32)); - __m128i m6 = _mm_loadu_si128(reinterpret_cast(src + 40)); - __m128i m7 = _mm_loadu_si128(reinterpret_cast(src + 48)); - __m128i m8 = _mm_loadu_si128(reinterpret_cast(src + 56)); - b->chunks[0] = _mm_packus_epi16(m1, m2); - b->chunks[1] = _mm_packus_epi16(m3, m4); - b->chunks[2] = _mm_packus_epi16(m5, m6); - b->chunks[3] = _mm_packus_epi16(m7, m8); + __m128i m1 = __lsx_vld(reinterpret_cast(src), 0); + __m128i m2 = __lsx_vld(reinterpret_cast(src), 16); + __m128i m3 = __lsx_vld(reinterpret_cast(src), 32); + __m128i m4 = __lsx_vld(reinterpret_cast(src), 48); + __m128i m5 = __lsx_vld(reinterpret_cast(src), 64); + __m128i m6 = __lsx_vld(reinterpret_cast(src), 80); + __m128i m7 = __lsx_vld(reinterpret_cast(src), 96); + __m128i m8 = __lsx_vld(reinterpret_cast(src), 112); + b->chunks[0] = __lsx_vssrlni_bu_h(m2, m1, 0); + b->chunks[1] = __lsx_vssrlni_bu_h(m4, m3, 0); + b->chunks[2] = __lsx_vssrlni_bu_h(m6, m5, 0); + b->chunks[3] = __lsx_vssrlni_bu_h(m8, m7, 0); } static inline void base64_decode(char *out, __m128i str) { - // credit: aqrit - - const __m128i pack_shuffle = - _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); + __m128i t0 = __lsx_vor_v( + __lsx_vslli_w(str, 26), + __lsx_vslli_w(__lsx_vand_v(str, lsx_splat_u32(0x0000FF00)), 12)); + __m128i t1 = __lsx_vsrli_w(__lsx_vand_v(str, lsx_splat_u32(0x003F0000)), 2); + __m128i t2 = __lsx_vor_v(t0, t1); + __m128i t3 = __lsx_vor_v(t2, __lsx_vsrli_w(str, 16)); + const v16u8 pack_shuffle = {3, 2, 1, 7, 6, 5, 11, 10, + 9, 15, 14, 13, 0, 0, 0, 0}; + t3 = __lsx_vshuf_b(t3, t3, (__m128i)pack_shuffle); - const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140)); - const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000)); - const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle); // Store the output: - // this writes 16 bytes, but we only need 12. - _mm_storeu_si128((__m128i *)out, t2); + // we only need 12. 
+ __lsx_vstelm_d(t3, out, 0, 0); + __lsx_vstelm_w(t3, out + 8, 0, 2); } // decode 64 bytes and output 48 bytes static inline void base64_decode_block(char *out, const char *src) { - base64_decode(out, _mm_loadu_si128(reinterpret_cast(src))); + base64_decode(out, __lsx_vld(reinterpret_cast(src), 0)); base64_decode(out + 12, - _mm_loadu_si128(reinterpret_cast(src + 16))); + __lsx_vld(reinterpret_cast(src), 16)); base64_decode(out + 24, - _mm_loadu_si128(reinterpret_cast(src + 32))); + __lsx_vld(reinterpret_cast(src), 32)); base64_decode(out + 36, - _mm_loadu_si128(reinterpret_cast(src + 48))); + __lsx_vld(reinterpret_cast(src), 48)); } static inline void base64_decode_block_safe(char *out, const char *src) { - base64_decode(out, _mm_loadu_si128(reinterpret_cast(src))); - base64_decode(out + 12, - _mm_loadu_si128(reinterpret_cast(src + 16))); - base64_decode(out + 24, - _mm_loadu_si128(reinterpret_cast(src + 32))); - char buffer[16]; - base64_decode(buffer, - _mm_loadu_si128(reinterpret_cast(src + 48))); - std::memcpy(out + 36, buffer, 12); + base64_decode_block(out, src); } static inline void base64_decode_block(char *out, block64 *b) { base64_decode(out, b->chunks[0]); @@ -35994,96 +62396,84 @@ static inline void base64_decode_block(char *out, block64 *b) { base64_decode(out + 36, b->chunks[3]); } static inline void base64_decode_block_safe(char *out, block64 *b) { - base64_decode(out, b->chunks[0]); - base64_decode(out + 12, b->chunks[1]); - base64_decode(out + 24, b->chunks[2]); - char buffer[16]; - base64_decode(buffer, b->chunks[3]); - std::memcpy(out + 36, buffer, 12); -} - -template -result compress_decode_base64(char *dst, const chartype *src, size_t srclen, - base64_options options) { - const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value - : tables::base64::to_base64_value; - // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; - } - size_t equalsigns = 0; - if (srclen > 0 && src[srclen - 1] == '=') { - srclen--; - equalsigns = 1; - // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; - } - if (srclen > 0 && src[srclen - 1] == '=') { - srclen--; - equalsigns = 2; - } + base64_decode_block(out, b); +} + +template +full_result +compress_decode_base64(char *dst, const char_type *src, size_t srclen, + base64_options options, + last_chunk_handling_options last_chunk_options) { + const uint8_t *to_base64 = + default_or_url ? tables::base64::to_base64_default_or_url_value + : (base64_url ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value); + auto ri = simdutf::scalar::base64::find_end(src, srclen, options); + size_t equallocation = ri.equallocation; + size_t equalsigns = ri.equalsigns; + srclen = ri.srclen; + size_t full_input_length = ri.full_input_length; + if (srclen == 0) { + if (!ignore_garbage && equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, full_input_length, 0}; } - char *end_of_safe_64byte_zone = - (srclen + 3) / 4 * 3 >= 63 ? 
dst + (srclen + 3) / 4 * 3 - 63 : dst; - - const chartype *const srcinit = src; + const char_type *const srcinit = src; const char *const dstinit = dst; - const chartype *const srcend = src + srclen; + const char_type *const srcend = src + srclen; - constexpr size_t block_size = 6; - static_assert(block_size >= 2, "block should of size 2 or more"); + constexpr size_t block_size = 10; char buffer[block_size * 64]; char *bufferptr = buffer; if (srclen >= 64) { - const chartype *const srcend64 = src + srclen - 64; + const char_type *const srcend64 = src + srclen - 64; while (src <= srcend64) { block64 b; load_block(&b, src); src += 64; bool error = false; - uint64_t badcharmask = to_base64_mask(&b, &error); - if (error) { - src -= 64; - while (src < srcend && to_base64[uint8_t(*src)] <= 64) { - src++; + uint64_t badcharmask = + to_base64_mask(&b, &error); + if (badcharmask) { + if (error && !ignore_garbage) { + src -= 64; + while (src < srcend && scalar::base64::is_eight_byte(*src) && + to_base64[uint8_t(*src)] <= 64) { + src++; + } + if (src < srcend) { + // should never happen + } + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; } - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } + if (badcharmask != 0) { - // optimization opportunity: check for simple masks like those made of - // continuous 1s followed by continuous 0s. And masks containing a - // single bad character. - bufferptr += compress_block(&b, badcharmask, bufferptr); - } else if (bufferptr != buffer) { - copy_block(&b, bufferptr); - bufferptr += 64; - } else { - if (dst >= end_of_safe_64byte_zone) { - base64_decode_block_safe(dst, &b); + if (is_power_of_two(badcharmask)) { + bufferptr += compress_block_single(&b, badcharmask, bufferptr); } else { - base64_decode_block(dst, &b); + bufferptr += compress_block(&b, badcharmask, bufferptr); } - dst += 48; + } else { + // optimization opportunity: if bufferptr == buffer and mask == 0, we + // can avoid the call to compress_block and decode directly. + copy_block(&b, bufferptr); + bufferptr += 64; } if (bufferptr >= (block_size - 1) * 64 + buffer) { - for (size_t i = 0; i < (block_size - 2); i++) { + for (size_t i = 0; i < (block_size - 1); i++) { base64_decode_block(dst, buffer + i * 64); dst += 48; } - if (dst >= end_of_safe_64byte_zone) { - base64_decode_block_safe(dst, buffer + (block_size - 2) * 64); - } else { - base64_decode_block(dst, buffer + (block_size - 2) * 64); - } - dst += 48; std::memcpy(buffer, buffer + (block_size - 1) * 64, 64); // 64 might be too much bufferptr -= (block_size - 1) * 64; } } } - char *buffer_start = buffer; // Optimization note: if this is almost full, then it is worth our // time, otherwise, we should just decode directly. 
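[editor's note] A sketch of the scalar tail kept by the hunks that follow: once fewer than four sextets remain, the decoder packs four 6-bit values into a 24-bit "triple", shifts it into the upper bytes of a 32-bit word, byte-swaps it (LSX, like x86, is little-endian), and copies out only the payload bytes. Below is a minimal standalone model of that packing; u32_swap_bytes here is a stand-in for scalar::u32_swap_bytes, which the diff only references, and decode_quantum is an illustrative name, not a function in the library.

    #include <cstdint>
    #include <cstring>

    // Stand-in for scalar::u32_swap_bytes (a full 32-bit byte swap).
    static inline uint32_t u32_swap_bytes(uint32_t v) {
      return ((v & 0xFF) << 24) | ((v & 0xFF00) << 8) | ((v >> 8) & 0xFF00) |
             (v >> 24);
    }

    // Pack one base64 quantum: four 6-bit values (each already mapped from
    // ASCII into 0..63) into three output bytes, as in the scalar tail.
    static inline void decode_quantum(const uint8_t s[4], char *dst) {
      uint32_t triple = (uint32_t(s[0]) << 3 * 6) + (uint32_t(s[1]) << 2 * 6) +
                        (uint32_t(s[2]) << 1 * 6) + (uint32_t(s[3]) << 0 * 6);
      triple <<= 8;                    // 24 payload bits now sit in the top bytes
      triple = u32_swap_bytes(triple); // little-endian: payload bytes come first
      std::memcpy(dst, &triple, 3);    // e.g. "TWFu" -> {19,22,5,46} -> "Man"
    }

The bulk path in the hunks below writes four bytes and advances the output by three, which keeps the store branchless; only at the very end does it switch to exact-width copies so the output buffer is never overrun.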
@@ -36092,8 +62482,10 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (val > 64) { - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; + if ((!scalar::base64::is_eight_byte(*src) || val > 64) && + !ignore_garbage) { + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; } bufferptr += (val <= 63); src++; @@ -36101,11 +62493,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, } for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { - if (dst >= end_of_safe_64byte_zone) { - base64_decode_block_safe(dst, buffer_start); - } else { - base64_decode_block(dst, buffer_start); - } + base64_decode_block(dst, buffer_start); dst += 48; } if ((bufferptr - buffer_start) % 64 != 0) { @@ -36115,7 +62503,8 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) << 8; - triple = scalar::utf32::swap_bytes(triple); + // lsx is little-endian + triple = scalar::u32_swap_bytes(triple); std::memcpy(dst, &triple, 4); dst += 3; @@ -36127,96 +62516,143 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) << 8; - triple = scalar::utf32::swap_bytes(triple); + // lsx is little-endian + triple = scalar::u32_swap_bytes(triple); std::memcpy(dst, &triple, 3); dst += 3; buffer_start += 4; } // we may have 1, 2 or 3 bytes left and we need to decode them so let us - // bring in src content + // backtrack int leftover = int(bufferptr - buffer_start); - if (leftover > 0) { - while (leftover < 4 && src < srcend) { - uint8_t val = to_base64[uint8_t(*src)]; - if (val > 64) { - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; + while (leftover > 0) { + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; } - buffer_start[leftover] = char(val); - leftover += (val <= 63); - src++; - } - - if (leftover == 1) { - return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)}; - } - if (leftover == 2) { - uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) + - (uint32_t(buffer_start[1]) << 2 * 6); - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 1); - dst += 1; - } else if (leftover == 3) { - uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) + - (uint32_t(buffer_start[1]) << 2 * 6) + - (uint32_t(buffer_start[2]) << 1 * 6); - triple = scalar::utf32::swap_bytes(triple); - - triple >>= 8; - - std::memcpy(dst, &triple, 2); - dst += 2; } else { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 3); - dst += 3; + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } } + src--; + leftover--; } } if (src < srcend + equalsigns) { - result r = - scalar::base64::base64_tail_decode(dst, src, srcend - src, options); - if (r.error == error_code::INVALID_BASE64_CHARACTER) { - r.count += size_t(src - srcinit); - return r; - } else { - r.count += size_t(dst - dstinit); - } - if(r.error == error_code::SUCCESS && equalsigns 
> 0) { - // additional checks - if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - r.error = error_code::INVALID_BASE64_CHARACTER; + full_result r = scalar::base64::base64_tail_decode( + dst, src, srcend - src, equalsigns, options, last_chunk_options); + r = scalar::base64::patch_tail_result( + r, size_t(src - srcinit), size_t(dst - dstinit), equallocation, + full_input_length, last_chunk_options); + // When is_partial(last_chunk_options) is true, we must either end with + // the end of the stream (beyond whitespace) or right after a non-ignorable + // character or at the very beginning of the stream. + // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS && + r.input_count < full_input_length) { + // First check if we can extend the input to the end of the stream + while (r.input_count < full_input_length && + base64_ignorable(*(srcinit + r.input_count), options)) { + r.input_count++; + } + // If we are still not at the end of the stream, then we must backtrack + // to the last non-ignorable character. + if (r.input_count < full_input_length) { + while (r.input_count > 0 && + base64_ignorable(*(srcinit + r.input_count - 1), options)) { + r.input_count--; + } } } return r; } - if(equalsigns > 0) { - if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)}; + if (equalsigns > 0 && !ignore_garbage) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; } } - return {SUCCESS, size_t(dst - dstinit)}; + return {SUCCESS, srclen, size_t(dst - dstinit)}; } -/* end file src/westmere/sse_base64.cpp */ +/* end file src/lsx/lsx_base64.cpp */ +/* begin file src/lsx/lsx_find.cpp */ +simdutf_really_inline const char *util_find(const char *start, const char *end, + char character) noexcept { + if (start >= end) + return end; -} // unnamed namespace -} // namespace westmere + const int step = 16; + __m128i char_vec = __lsx_vreplgr2vr_b(static_cast(character)); + + while (end - start >= step) { + __m128i data = __lsx_vld(reinterpret_cast(start), 0); + __m128i cmp = __lsx_vseq_b(data, char_vec); + if (__lsx_bnz_v(cmp)) { + uint16_t mask = + static_cast(__lsx_vpickve2gr_hu(__lsx_vmsknz_b(cmp), 0)); + return start + trailing_zeroes(mask); + } + + start += step; + } + + // Handle remaining bytes with scalar loop + for (; start < end; ++start) { + if (*start == character) { + return start; + } + } + + return end; +} + +simdutf_really_inline const char16_t *util_find(const char16_t *start, + const char16_t *end, + char16_t character) noexcept { + if (start >= end) + return end; + + const int step = 8; + __m128i char_vec = __lsx_vreplgr2vr_h(static_cast(character)); + + while (end - start >= step) { + __m128i data = __lsx_vld(reinterpret_cast(start), 0); + __m128i cmp = __lsx_vseq_h(data, char_vec); + if (__lsx_bnz_v(cmp)) { + uint16_t mask = + static_cast(__lsx_vpickve2gr_hu(__lsx_vmsknz_b(cmp), 0)); + return start + trailing_zeroes(mask) / 2; + } + + start += step; + } + + // Handle remaining elements with scalar loop + for (; start < end; ++start) { + if (*start == character) { + return start; + } + } + + return end; +} +/* end file src/lsx/lsx_find.cpp */ +#endif // SIMDUTF_FEATURE_BASE64 + +} // namespace +} // namespace lsx } // namespace simdutf /* begin file src/generic/buf_block_reader.h 
*/ namespace simdutf { -namespace westmere { +namespace lsx { namespace { -// Walks through a buffer in block-sized increments, loading the last part with spaces -template -struct buf_block_reader { +// Walks through a buffer in block-sized increments, loading the last part with +// spaces +template struct buf_block_reader { public: simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); simdutf_really_inline size_t block_index(); @@ -36225,14 +62661,16 @@ struct buf_block_reader { /** * Get the last block, padded with spaces. * - * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this - * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there - * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * There will always be a last block, with at least 1 byte, unless len == 0 + * (in which case this function fills the buffer with spaces and returns 0. In + * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder + * block with STEP_SIZE bytes and no spaces for padding. * * @return the number of effective characters in the last block. */ simdutf_really_inline size_t get_remainder(uint8_t *dst) const; simdutf_really_inline void advance(); + private: const uint8_t *buf; const size_t len; @@ -36241,9 +62679,10 @@ struct buf_block_reader { }; // Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text_64(const uint8_t *text) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - for (size_t i=0; i); i++) { +simdutf_unused static char *format_input_text_64(const uint8_t *text) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); } buf[sizeof(simd8x64)] = '\0'; @@ -36251,98 +62690,114 @@ simdutf_unused static char * format_input_text_64(const uint8_t *text) { } // Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text(const simd8x64& in) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - in.store(reinterpret_cast(buf)); - for (size_t i=0; i); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } +simdutf_unused static char *format_input_text(const simd8x64 &in) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + if (buf[i] < ' ') { + buf[i] = '_'; + } } buf[sizeof(simd8x64)] = '\0'; return buf; } -simdutf_unused static char * format_mask(uint64_t mask) { - static char *buf = reinterpret_cast(malloc(64 + 1)); - for (size_t i=0; i<64; i++) { +simdutf_unused static char *format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i = 0; i < 64; i++) { buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; } buf[64] = '\0'; return buf; } -template -simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} +template +simdutf_really_inline +buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) + : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, + idx{0} {} -template -simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } +template +simdutf_really_inline size_t buf_block_reader::block_index() { + return idx; +} -template +template simdutf_really_inline bool buf_block_reader::has_full_block() const { return idx < lenminusstep; } -template -simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { +template +simdutf_really_inline const uint8_t * +buf_block_reader::full_block() const { return &buf[idx]; } -template -simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { - if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers - std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. +template +simdutf_really_inline size_t +buf_block_reader::get_remainder(uint8_t *dst) const { + if (len == idx) { + return 0; + } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, + STEP_SIZE); // std::memset STEP_SIZE because it is more efficient + // to write out 8 or 16 bytes at once. std::memcpy(dst, buf + idx, len - idx); return len - idx; } -template +template simdutf_really_inline void buf_block_reader::advance() { idx += STEP_SIZE; } } // unnamed namespace -} // namespace westmere +} // namespace lsx } // namespace simdutf /* end file src/generic/buf_block_reader.h */ +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ namespace simdutf { -namespace westmere { +namespace lsx { namespace { namespace utf8_validation { using namespace simd; - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr 
const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, // 10______ ________ TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, // 1100____ ________ @@ -36352,245 +62807,283 @@ using namespace simd; // 1110____ ________ TOO_SHORT | OVERLONG_3 | SURROGATE, // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, // ________ 1001____ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } - - // - // Return nonzero if there are incomplete multibyte characters at the end of the block: - // e.g. if there is a 4-byte character, but it's 3 bytes from the end. - // - simdutf_really_inline simd8 is_incomplete(const simd8 input) { - // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): - // ... 1111____ 111_____ 11______ - static const uint8_t max_array[32] = { - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 - }; - const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); - return input.gt_bits(max_value); - } - - struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; - // The last input we received - simd8 prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast path) - simd8 prev_incomplete; - - // - // Check whether the current bytes are valid UTF-8. 
- // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - // The only problem that can happen at EOF is that a multibyte character is too short - // or a byte value too large in the last bytes: check_special_cases only checks for bytes - // too large in the first of two bytes. - simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error |= this->prev_incomplete; - } - - simdutf_really_inline void check_next_input(const simd8x64& input) { - if(simdutf_likely(is_ascii(input))) { - this->error |= this->prev_incomplete; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); - this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = {255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0b11110000u - 1, + 0b11100000u - 1, + 0b11000000u - 1}; + const simd8 max_value( + &max_array[sizeof(max_array) - sizeof(simd8)]); + return input.gt_bits(max_value); +} + +struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + simd8 prev_incomplete; + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64 &input) { + if (simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } + this->prev_incomplete = + is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; } + } - // do not forget to call check_eof! - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - }; // struct utf8_checker +}; // struct utf8_checker } // namespace utf8_validation using utf8_validation::utf8_checker; } // unnamed namespace -} // namespace westmere +} // namespace lsx } // namespace simdutf /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ /* begin file src/generic/utf8_validation/utf8_validator.h */ namespace simdutf { -namespace westmere { +namespace lsx { namespace { namespace utf8_validation { /** * Validates that the string is actual UTF-8. 
*/ -template -bool generic_validate_utf8(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); +template +bool generic_validate_utf8(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); c.check_next_input(in); reader.advance(); - c.check_eof(); - return !c.errors(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); } -bool generic_validate_utf8(const char * input, size_t length) { - return generic_validate_utf8(reinterpret_cast(input),length); +bool generic_validate_utf8(const char *input, size_t length) { + return generic_validate_utf8( + reinterpret_cast(input), length); } /** * Validates that the string is actual UTF-8 and stops on errors. */ -template -result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - if(c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input + count), length - count); - res.count += count; - return res; - } - reader.advance(); - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); +template +result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); c.check_next_input(in); - reader.advance(); - c.check_eof(); if (c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input) + count, length - count); + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input + count), length - count); res.count += count; return res; - } else { - return result(error_code::SUCCESS, length); } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } } -result generic_validate_utf8_with_errors(const char * input, size_t length) { - return generic_validate_utf8_with_errors(reinterpret_cast(input),length); -} - -template -bool generic_validate_ascii(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - uint8_t 
blocks[64]{}; - simd::simd8x64 running_or(blocks); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - running_or |= in; - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - running_or |= in; - return running_or.is_ascii(); +result generic_validate_utf8_with_errors(const char *input, size_t length) { + return generic_validate_utf8_with_errors( + reinterpret_cast(input), length); } -bool generic_validate_ascii(const char * input, size_t length) { - return generic_validate_ascii(reinterpret_cast(input),length); -} +} // namespace utf8_validation +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_ASCII +/* begin file src/generic/ascii_validation.h */ +namespace simdutf { +namespace lsx { +namespace { +namespace ascii_validation { -template -result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); +result generic_validate_ascii_with_errors(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); size_t count{0}; while (reader.has_full_block()) { simd::simd8x64 in(reader.full_block()); if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); return result(res.error, count + res.count); } reader.advance(); @@ -36601,64 +63094,482 @@ result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) reader.get_remainder(block); simd::simd8x64 in(block); if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); return result(res.error, count + res.count); } else { return result(error_code::SUCCESS, length); } } -result generic_validate_ascii_with_errors(const char * input, size_t length) { - return generic_validate_ascii_with_errors(reinterpret_cast(input),length); +bool generic_validate_ascii(const char *input, size_t length) { + buf_block_reader<64> reader(reinterpret_cast(input), length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + return false; + } + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + return in.is_ascii(); } -} // namespace utf8_validation +} // namespace ascii_validation } // unnamed namespace -} // namespace westmere +} // namespace lsx } // namespace simdutf -/* end file src/generic/utf8_validation/utf8_validator.h */ -// transcoding from UTF-8 to UTF-16 -/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* end file src/generic/ascii_validation.h */ +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + // transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +namespace simdutf { +namespace lsx { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // For UTF-8 to Latin 1, we can allow any ASCII character, and any + // continuation byte, but the non-ASCII leading 
bytes must be 0b11000011 or + // 0b11000010 and nothing else. + // + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + FORBIDDEN, + // 1110____ ________ + FORBIDDEN, + // 1111____ ________ + FORBIDDEN); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + FORBIDDEN, + // ____0101 ________ + FORBIDDEN, + // ____011_ ________ + FORBIDDEN, FORBIDDEN, + + // ____1___ ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, + // ____1101 ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 16; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. 
+ size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if (howmany == 0) { + return 0; + } + latin1_output += howmany; + } + return latin1_output - start; + } + + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. 
If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + latin1_output += res.count; + } + } + return result(error_code::SUCCESS, latin1_output - start); + } + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } +}; // struct utf8_checker +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ namespace simdutf { -namespace westmere { +namespace lsx { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + +simdutf_really_inline size_t convert_valid(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last + // 16 bytes, and if the data is valid, then it is entirely safe because 16 + // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally + // assume that you have valid UTF-8 input, so we are going to go back from the + // end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. 
+ uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, + latin1_output); + latin1_output += howmany; + } + return latin1_output - start; +} + +} // namespace utf8_to_latin1 +} // namespace +} // namespace lsx +} // namespace simdutf + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + // transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +namespace simdutf { +namespace lsx { namespace { namespace utf8_to_utf16 { using namespace simd; template -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char16_t* utf16_output) noexcept { - // The implementation is not specific to haswell and should be moved to the generic directory. +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. size_t pos = 0; - char16_t* start{utf16_output}; + char16_t *start{utf16_output}; const size_t safety_margin = 16; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - // this loop could be unrolled further. For example, we could process the mask - // far more than 64 bytes. + while (pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the + // mask far more than 64 bytes. simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { + if (in.is_ascii()) { in.store_ascii_as_utf16(utf16_output); utf16_output += 64; pos += 64; } else { - // Slow path. We hope that the compiler will recognize that this is a slow path. - // Anything that is not a continuation mask is a 'leading byte', that is, the - // start of a new code point. + // Slow path. We hope that the compiler will recognize that this is a slow + // path. 
Anything that is not a continuation mask is a 'leading byte', + // that is, the start of a new code point. uint64_t utf8_continuation_mask = in.lt(-65 + 1); - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte uint64_t utf8_leading_mask = ~utf8_continuation_mask; - // The *start* of code points is not so useful, rather, we want the *end* of code points. - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // The *start* of code points is not so useful, rather, we want the *end* + // of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; // We process in blocks of up to 12 bytes except possibly // for fast paths which may process up to 16 bytes. For the // slow path to work, we should have at least 12 input bytes left. size_t max_starting_point = (pos + 64) - 12; // Next loop is going to run at least five times when using solely // the slow/regular path, and at least four times if there are fast paths. - while(pos < max_starting_point) { + while (pos < max_starting_point) { // Performance note: our ability to compute 'consumed' and // then shift and recompute is critical. If there is a // latency of, say, 4 cycles on getting 'consumed', then @@ -36672,8 +63583,8 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size, // Thus we may allow convert_masked_utf8_to_utf16 to process // more bytes at a time under a fast-path mode where 16 bytes // are consumed at once (e.g., when encountering ASCII). - size_t consumed = convert_masked_utf8_to_utf16(input + pos, - utf8_end_of_code_point_mask, utf16_output); + size_t consumed = convert_masked_utf8_to_utf16( + input + pos, utf8_end_of_code_point_mask, utf16_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -36683,409 +63594,493 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size, // 85% to 90% efficiency. 
} } - utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); return utf16_output - start; } } // namespace utf8_to_utf16 } // unnamed namespace -} // namespace westmere +} // namespace lsx } // namespace simdutf /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +namespace simdutf { +namespace lsx { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
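// ---- Editorial sketch (not part of the vendored simdutf sources) --------
// How the three 16-entry tables combine, in scalar form: each adjacent byte
// pair (prev1, cur) picks one mask by the high nibble of prev1, one by the
// low nibble of prev1, and one by the high nibble of cur (the table contents
// are exactly the lookup_16 arguments above and below). A nonzero AND flags
// an error class.
#include <cstdint>

inline uint8_t classify_pair_sketch(uint8_t prev1, uint8_t cur,
                                    const uint8_t byte_1_high[16],
                                    const uint8_t byte_1_low[16],
                                    const uint8_t byte_2_high[16]) {
  return uint8_t(byte_1_high[prev1 >> 4] & byte_1_low[prev1 & 0x0F] &
                 byte_2_high[cur >> 4]);
}
// --------------------------------------------------------------------------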
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + template + simdutf_really_inline size_t convert(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
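// ---- Editorial sketch (not part of the vendored simdutf sources) --------
// The margin computation just above, isolated: scan backwards until eight
// lead bytes have been seen and reserve everything from there on for the
// scalar tail. Eight lead bytes start at least eight code points, so a
// kernel that may overrun its output by up to 8 units never writes past the
// end when it stops safety_margin bytes early.
#include <cstddef>
#include <cstdint>

inline size_t safety_margin_sketch(const char *in, size_t size) {
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) > -65); // count lead bytes
  }
  return size - margin + 1; // bytes handed to the scalar fallback
}
// --------------------------------------------------------------------------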
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); + if (howmany == 0) { + return 0; + } + utf16_output += howmany; + } + return utf16_output - start; + } + + template + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. 
+ const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. 
+ result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } +}; // struct utf8_checker +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* begin file src/generic/utf8/utf16_length_from_utf8_bytemask.h */ namespace simdutf { -namespace westmere { +namespace lsx { namespace { -namespace utf8_to_utf16 { +namespace utf8 { + using namespace simd; +simdutf_really_inline size_t utf16_length_from_utf8_bytemask(const char *in, + size_t size) { + using vector_i8 = simd8; + using vector_u8 = simd8; + using vector_u64 = simd64; - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
- const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + constexpr size_t N = vector_i8::SIZE; + constexpr size_t max_iterations = 255 / 2; - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + auto counters = vector_u64::zero(); + auto local = vector_u8::zero(); - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } + size_t iterations = 0; + size_t pos = 0; + size_t count = 0; + for (; pos + N <= size; pos += N) { + const auto input = + vector_i8::load(reinterpret_cast(in + pos)); + const auto continuation = input > int8_t(-65); + const auto utf_4bytes = vector_u8(input.value) >= uint8_t(240); - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; + local -= vector_u8(continuation); + local -= vector_u8(utf_4bytes); - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - - template - simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { - size_t pos = 0; - char16_t* start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. 
- size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); - if(howmany == 0) { return 0; } - utf16_output += howmany; - } - return utf16_output - start; - } - - template - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { - size_t pos = 0; - char16_t* start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. 
- size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - if(pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. 
- result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf16_output += res.count; - } - } - return result(error_code::SUCCESS, utf16_output - start); + iterations += 1; + if (iterations == max_iterations) { + counters += sum_8bytes(local); + local = vector_u8::zero(); + iterations = 0; } + } - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + if (iterations > 0) { + count += local.sum_bytes(); + } - }; // struct utf8_checker -} // utf8_to_utf16 namespace + count += counters.sum(); + + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + +} // namespace utf8 } // unnamed namespace -} // namespace westmere +} // namespace lsx } // namespace simdutf -/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -// transcoding from UTF-8 to UTF-32 -/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* end file src/generic/utf8/utf16_length_from_utf8_bytemask.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + // transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ namespace simdutf { -namespace westmere { +namespace lsx { namespace { namespace utf8_to_utf32 { using namespace simd; - -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char32_t* utf32_output) noexcept { +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char32_t *utf32_output) noexcept { size_t pos = 0; - char32_t* start{utf32_output}; + char32_t *start{utf32_output}; const size_t safety_margin = 16; // to avoid overruns! 
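// Editorial note (hedged): unlike the validating converters above, this
// valid-input path can use a constant safety_margin of 16. As the comments
// in the validating paths explain, 16 trailing bytes of known-valid UTF-8
// always decode to more output than the at-most-8-unit overrun of
// convert_masked_utf8_to_utf32, so no backwards scan for lead bytes is
// needed here.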
- while(pos + 64 + safety_margin <= size) { + while (pos + 64 + safety_margin <= size) { simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { + if (in.is_ascii()) { in.store_ascii_as_utf32(utf32_output); utf32_output += 64; pos += 64; } else { - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - size_t max_starting_point = (pos + 64) - 12; - while(pos < max_starting_point) { - size_t consumed = convert_masked_utf8_to_utf32(input + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + size_t max_starting_point = (pos + 64) - 12; + while (pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32( + input + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; } } } - utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, + utf32_output); return utf32_output - start; } - } // namespace utf8_to_utf32 } // unnamed namespace -} // namespace westmere +} // namespace lsx } // namespace simdutf /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ - - namespace simdutf { -namespace westmere { +namespace lsx { namespace { namespace utf8_to_utf32 { using namespace simd; - - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 
10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, // 10______ ________ TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, // 1100____ ________ @@ -37095,979 +64090,1485 @@ using namespace simd; // 1110____ ________ TOO_SHORT | OVERLONG_3 | SURROGATE, // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
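// Editorial example (hedged, derived from the tables above and below): the
// byte pair 0xED 0xA0 begins an encoded UTF-16 surrogate (U+D800).
// byte_1_high[0xE], byte_1_low[0xD] and byte_2_high[0xA] all carry
// SURROGATE, so the three-way AND is nonzero and the sequence is rejected.
// The legal continuations 0xED 0x80..0x9F survive because byte_2_high for
// nibbles 0x8 and 0x9 does not include SURROGATE.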
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, // ________ 1001____ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } - - - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; - - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - - - simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { - size_t pos = 0; - char32_t* start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. 
- const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); - if(howmany == 0) { return 0; } - utf32_output += howmany; - } - return utf32_output - start; - } - - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { - size_t pos = 0; - char32_t* start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! 
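// Editorial note (hedged): the chunk checks below are unrolled by hand
// because, per the comment above, Visual Studio does not optimize a plain
// loop over NUM_CHUNKS well; the static_assert pins NUM_CHUNKS to the two
// shapes (2 or 4 chunks per 64-byte block) that the unrolled branches
// handle.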
- while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. 
Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. } - if(errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if (howmany == 0) { + return 0; } - if(pos < size) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - if (res.error) { // In case of error, we want the error position + utf32_output += howmany; + } + return utf32_output - start; + } + + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. 
+ static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); res.count += pos; return res; - } else { // In case of success, we want the number of word written - utf32_output += res.count; } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
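// Editorial note: the "80% efficiency" figure is just the worst-case
// arithmetic of the loop above: at most 12 of every 64 bytes are
// reprocessed, so (64 - 12) / 64 = 81.25% of the block is consumed on its
// first pass; typical inputs leave fewer leftover bytes, hence 85% to 90%.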
} - return result(error_code::SUCCESS, utf32_output - start); } - - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if (pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } } + return result(error_code::SUCCESS, utf32_output - start); + } - }; // struct utf8_checker -} // utf8_to_utf32 namespace + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_to_utf32 } // unnamed namespace -} // namespace westmere +} // namespace lsx } // namespace simdutf /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -// other functions -/* begin file src/generic/utf8.h */ +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 +/* begin file src/generic/utf8.h */ namespace simdutf { -namespace westmere { +namespace lsx { namespace { namespace utf8 { using namespace simd; -simdutf_really_inline size_t count_code_points(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.gt(-65); - count += count_ones(utf8_continuation_mask); +simdutf_really_inline size_t count_code_points(const char *in, size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + +#ifdef SIMDUTF_SIMD_HAS_BYTEMASK +simdutf_really_inline size_t count_code_points_bytemask(const char *in, + size_t size) { + using vector_i8 = simd8; + using vector_u8 = simd8; + using vector_u64 = simd64; + + constexpr size_t N = vector_i8::SIZE; + constexpr size_t max_iterations = 255 / 4; + + size_t pos = 0; + size_t count = 0; + + auto counters = vector_u64::zero(); + auto local = vector_u8::zero(); + size_t iterations = 0; + for (; pos + 4 * N <= size; pos += 4 * N) { + const auto input0 = + simd8::load(reinterpret_cast(in + pos + 0 * N)); + const auto input1 = + simd8::load(reinterpret_cast(in + pos + 1 * N)); + const auto input2 = + simd8::load(reinterpret_cast(in + pos + 2 * N)); + const auto input3 = + simd8::load(reinterpret_cast(in + pos + 3 * N)); + const auto mask0 = input0 > int8_t(-65); + const auto mask1 = input1 > int8_t(-65); + const auto mask2 = input2 > int8_t(-65); + const auto mask3 = input3 > int8_t(-65); + + local -= vector_u8(mask0); + local -= vector_u8(mask1); + local -= vector_u8(mask2); + local -= vector_u8(mask3); + + iterations += 1; + if (iterations == max_iterations) { + counters += sum_8bytes(local); + local = vector_u8::zero(); + iterations = 0; + } + } + + if (iterations > 0) { + count += local.sum_bytes(); + } + + count += counters.sum(); + + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} +#endif // SIMDUTF_SIMD_HAS_BYTEMASK + +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t 
size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + +} // namespace utf8 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8.h */ +#endif // SIMDUTF_FEATURE_UTF8 + +#if SIMDUTF_FEATURE_UTF16 +/* begin file src/generic/utf16/count_code_points_bytemask.h */ +namespace simdutf { +namespace lsx { +namespace { +namespace utf16 { + +using namespace simd; + +template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; + + size_t pos = 0; + size_t count = 0; + + constexpr size_t max_iterations = 65535; + const auto one = vector_u16::splat(1); + const auto zero = vector_u16::zero(); + + size_t iteration = 0; + + auto counters = zero; + for (; pos < size / N * N; pos += N) { + auto input = vector_u16::load(in + pos); + if simdutf_constexpr (!match_system(big_endian)) { + input = input.swap_bytes(); + } + + const auto t0 = input & uint16_t(0xfc00); + const auto t1 = t0 ^ uint16_t(0xdc00); + + // t2[0] == 1 iff input[0] outside range 0xdc00..dfff (the word is not a + // high surrogate) + const auto t2 = min(t1, one); + + counters += t2; + + iteration += 1; + if (iteration == max_iterations) { + count += counters.sum(); + counters = zero; + iteration = 0; + } + } + + if (iteration > 0) { + count += counters.sum(); + } + + return count + + scalar::utf16::count_code_points(in + pos, size - pos); +} + +} // namespace utf16 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf16/count_code_points_bytemask.h */ +/* begin file src/generic/utf16/change_endianness.h */ +namespace simdutf { +namespace lsx { +namespace { +namespace utf16 { + +simdutf_really_inline void +change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { + size_t pos = 0; + + while (pos < size / 32 * 32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } + + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} + +} // namespace utf16 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf16/change_endianness.h */ +/* begin file src/generic/utf16/utf8_length_from_utf16_bytemask.h */ +namespace simdutf { +namespace lsx { +namespace { +namespace utf16 { + +using namespace simd; + +template +simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, + size_t size) { + size_t pos = 0; + + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; + + const auto one = vector_u16::splat(1); + + auto v_count = vector_u16::zero(); + + // each char16 yields at least one byte + size_t count = size / N * N; + + // in a single iteration the increment is 0, 1 or 2, despite we have + // three additions + constexpr size_t max_iterations = 65535 / 2; + size_t iteration = max_iterations; + + for (; pos < size / N * N; pos += N) { + auto input = 
vector_u16::load(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input = input.swap_bytes(); + } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 .. 0xdfff - high surrogate + const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); + + // c0 - chars that yield 2- or 3-byte UTF-8 codes + const auto c0 = min(input & uint16_t(0xff80), one); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + const auto c1 = min(input & uint16_t(0xf800), one); + + /* + Explanation of how the counting works. + + In the case of a non-surrogate character we count: + * always 1 -- see how `count` is initialized above; + * c0 = 1 if the current char yields 2 or 3 bytes; + * c1 = 1 if the current char yields 3 bytes. + + Thus, we always have the correct count for the current char: + 1, 2 or 3 bytes. + + A trickier part is how we count surrogate pairs. Whenever + we encounter a surrogate (low or high), we count it as + 3 and then subtract 1 (`is_surrogate` is -1 or 0), so + each surrogate char yields 2. A surrogate pair, that + is a low surrogate followed by a high one, yields + the expected 4 bytes. + + It also correctly handles cases where the low surrogate is + processed by this loop but the high surrogate is counted + by the scalar procedure. The scalar procedure uses exactly + the approach described, so for valid UTF-16 + strings it always counts correctly. + */ + v_count += c0; + v_count += c1; + v_count += vector_u16(is_surrogate); + + iteration -= 1; + if (iteration == 0) { + count += v_count.sum(); + v_count = vector_u16::zero(); + iteration = max_iterations; + } + } + + if (iteration > 0) { + count += v_count.sum(); + } + + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} + +template +simdutf_really_inline result +utf8_length_from_utf16_with_replacement(const char16_t *in, size_t size) { + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; + if (N + 1 > size) { + return scalar::utf16::utf8_length_from_utf16_with_replacement( + in, size); + } // special case for short inputs + size_t pos = 0; + bool any_surrogates = false; + + const auto one = vector_u16::splat(1); + + auto v_count = vector_u16::zero(); + auto v_mismatched_count = vector_u16::zero(); + + size_t count = 0; + size_t mismatched_count = 0; + + // in a single iteration the increment is 0, 1 or 2, even though we have + // three additions + constexpr size_t max_iterations = 65535 / 2; + size_t iteration = max_iterations; + + if (scalar::utf16::is_low_surrogate(in[0])) { + any_surrogates = true; + mismatched_count += 1; + } + + for (; pos < (size - 1) / N * N; pos += N) { + auto input = vector_u16::load(reinterpret_cast(in + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input = input.swap_bytes(); + } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 ..
0xdfff - high surrogate + const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); + + // c0 - chars that yield 2- or 3-byte UTF-8 codes + const auto c0 = min(input & uint16_t(0xff80), one); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + const auto c1 = min(input & uint16_t(0xf800), one); + + v_count += c0; + v_count += c1; + v_count += vector_u16(is_surrogate); + if (is_surrogate.to_bitmask() != 0 || + scalar::utf16::is_low_surrogate(in[pos + N])) { + any_surrogates = true; + auto input_next = + vector_u16::load(reinterpret_cast(in + pos + 1)); + if simdutf_constexpr (!match_system(big_endian)) { + input_next = input_next.swap_bytes(); + } + + const auto lb_masked = input & (0xfc00); + const auto block_masked = input_next & (0xfc00); + + const auto lb_is_high = lb_masked == (0xd800); + const auto block_is_low = block_masked == (0xdc00); + + const auto illseq = min(vector_u16(lb_is_high ^ block_is_low), one); + + v_mismatched_count += illseq; + } + + iteration -= 1; + if (iteration == 0) { + count += v_count.sum(); + v_count = vector_u16::zero(); + mismatched_count += v_mismatched_count.sum(); + v_mismatched_count = vector_u16::zero(); + iteration = max_iterations; + } + } + + if (iteration > 0) { + count += v_count.sum(); + mismatched_count += v_mismatched_count.sum(); + } + + if (scalar::utf16::is_low_surrogate(in[pos])) { + any_surrogates = true; + if (!scalar::utf16::is_high_surrogate(in[pos - 1])) { + mismatched_count -= 1; + count += 2; + pos += 1; + } + } + count += pos; + count += mismatched_count; + if (scalar::utf16::is_high_surrogate(in[pos - 1])) { + any_surrogates = true; + if (pos == size) { + count += 2; + } else if (scalar::utf16::is_low_surrogate(in[pos])) { + pos += 1; + count += 2; } - return count + scalar::utf8::count_code_points(in + pos, size - pos); + } + result scalar_result = + scalar::utf16::utf8_length_from_utf16_with_replacement( + in + pos, size - pos); + return {any_surrogates ? SURROGATE : scalar_result.error, + count + scalar_result.count}; } -simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - // We count one word for anything that is not a continuation (so - // leading bytes). 
- count += 64 - count_ones(utf8_continuation_mask); - int64_t utf8_4byte = input.gteq_unsigned(240); - count += count_ones(utf8_4byte); - } - return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); -} -} // utf8 namespace +} // namespace utf16 } // unnamed namespace -} // namespace westmere +} // namespace lsx } // namespace simdutf -/* end file src/generic/utf8.h */ -/* begin file src/generic/utf16.h */ +/* end file src/generic/utf16/utf8_length_from_utf16_bytemask.h */ +/* begin file src/generic/utf16/utf32_length_from_utf16.h */ namespace simdutf { -namespace westmere { +namespace lsx { namespace { namespace utf16 { template -simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos < size/32*32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { input.swap_bytes(); } - uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); - count += count_ones(not_pair) / 2; - } - return count + scalar::utf16::count_code_points(in + pos, size - pos); +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, + size_t size) { + return count_code_points(in, size); } -template -simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for(;pos < size/32*32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { input.swap_bytes(); } - uint64_t ascii_mask = input.lteq(0x7F); - uint64_t twobyte_mask = input.lteq(0x7FF); - uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); +} // namespace utf16 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf16/utf32_length_from_utf16.h */ +/* begin file src/generic/utf16/to_well_formed.h */ +namespace simdutf { +namespace lsx { +namespace { +namespace utf16 { - size_t ascii_count = count_ones(ascii_mask) / 2; - size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; - size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; - size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; - count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; - } - return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +// Note: this is direct translation of westmere implementation. + +/* + * Process one block of 8 characters. If in_place is false, + * copy the block from in to out. If there is a sequencing + * error in the block, overwrite the illsequenced characters + * with the replacement character. This function reads one + * character before the beginning of the buffer as a lookback. + * If that character is illsequenced, it too is overwritten. 
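+ * For example, given the two code units 0xD800 0x0041, the unpaired + * leading surrogate 0xD800 is overwritten with the replacement + * character while 0x0041 ('A') is kept as-is.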
+ */ +template +simdutf_really_inline void utf16fix_block(char16_t *out, const char16_t *in) { + const char16_t replacement = scalar::utf16::replacement(); + + using vector_u16 = simd16; + auto swap_if_needed = [](uint16_t x) simdutf_constexpr -> uint16_t { + return scalar::utf16::swap_if_needed(x); + }; + + const auto lookback = vector_u16::load(in - 1); + const auto block = vector_u16::load(in); + + const auto lb_masked = lookback & swap_if_needed(0xfc00); + const auto block_masked = block & swap_if_needed(0xfc00); + + const auto lb_is_high = lb_masked == swap_if_needed(0xd800); + const auto block_is_low = block_masked == swap_if_needed(0xdc00); + const auto illseq = lb_is_high ^ block_is_low; + if (!illseq.is_zero()) { + /* compute the cause of the illegal sequencing */ + const auto lb_illseq = ~block_is_low & lb_is_high; + const auto block_illseq = + (~lb_is_high & block_is_low) | lb_illseq.template byte_right_shift<2>(); + + /* fix illegal sequencing in the lookback */ + const auto lb = lb_illseq.first(); + out[-1] = char16_t((lb & replacement) | (~lb & out[-1])); + /* fix illegal sequencing in the main block */ + const auto mask = as_vector_u16(block_illseq); + const auto fixed = (~mask & block) | (mask & replacement); + + fixed.store(reinterpret_cast(out)); + } else if (!in_place) { + block.store(reinterpret_cast(out)); + } } template -simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { - return count_code_points(in, size); -} +void to_well_formed(const char16_t *in, size_t n, char16_t *out) { + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; -simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { - size_t pos = 0; + if (n < N + 1) { + scalar::utf16::to_well_formed_utf16(in, n, out); + return; + } - while (pos < size/32*32) { - simd16x32 input(reinterpret_cast(in + pos)); - input.swap_bytes(); - input.store(reinterpret_cast(output)); - pos += 32; - output += 32; + const char16_t replacement = scalar::utf16::replacement(); + + out[0] = + scalar::utf16::is_low_surrogate(in[0]) ? replacement : in[0]; + + /* duplicate code to have the compiler specialise utf16fix_block() */ + if (in == out) { + constexpr bool inplace = true; + for (size_t i = 1; i + N < n; i += N) { + utf16fix_block(out + i, in + i); + } + + utf16fix_block(out + n - N, in + n - N); + } else { + constexpr bool copy_data = false; + for (size_t i = 1; i + N < n; i += N) { + utf16fix_block(out + i, in + i); + } + + utf16fix_block(out + n - N, in + n - N); } - scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); + out[n - 1] = scalar::utf16::is_high_surrogate(out[n - 1]) + ? replacement + : out[n - 1]; } -} // utf16 +} // namespace utf16 } // unnamed namespace -} // namespace westmere +} // namespace lsx } // namespace simdutf -/* end file src/generic/utf16.h */ -// transcoding from UTF-8 to Latin 1 -/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ - +/* end file src/generic/utf16/to_well_formed.h */ +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +/* begin file src/generic/validate_utf16.h */ namespace simdutf { -namespace westmere { +namespace lsx { namespace { -namespace utf8_to_latin1 { -using namespace simd; +namespace utf16 { +/* + UTF-16 validation + -------------------------------------------------- + In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. 
- simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// For UTF-8 to Latin 1, we can allow any ASCII character, and any continuation byte, -// but the non-ASCII leading bytes must be 0b11000011 or 0b11000010 and nothing else. -// -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - constexpr const uint8_t FORBIDDEN = 0xff; - - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - FORBIDDEN, - // 1110____ ________ - FORBIDDEN, - // 1111____ ________ - FORBIDDEN - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, - - // ____0100 ________ - FORBIDDEN, - // ____0101 ________ - FORBIDDEN, - // ____011_ ________ - FORBIDDEN, - FORBIDDEN, + In a vectorized algorithm we want to examine the most significant + nibble in order to select a fast path. If none of highest nibbles + are 0xD (13), than we are sure that UTF-16 chunk in a vector + register is valid. - // ____1___ ________ - FORBIDDEN, - FORBIDDEN, - FORBIDDEN, - FORBIDDEN, - FORBIDDEN, - // ____1101 ________ - FORBIDDEN, - FORBIDDEN, - FORBIDDEN - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + Let us analyze what we need to check if the nibble is 0xD. The + value of the preceding nibble determines what we have: - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + 0xd000 .. 0xd7ff - a valid word + 0xd800 .. 0xdbff - low surrogate + 0xdc00 .. 0xdfff - high surrogate - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } + Other constraints we have to consider: + - there must not be two consecutive low surrogates (0xd800 .. 
0xdbff) + - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) + - there must not be sole low surrogate nor high surrogate - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; + We are going to build three bitmasks based on the 3rd nibble: + - V = valid word, + - L = low surrogate (0xd800 .. 0xdbff) + - H = high surrogate (0xdc00 .. 0xdfff) - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - this->error |= check_special_cases(input, prev1); - } - - - simdutf_really_inline size_t convert(const char* in, size_t size, char* latin1_output) { - size_t pos = 0; - char* start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store((int8_t*)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. 
If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1(in + pos, - utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); - if(howmany == 0) { return 0; } - latin1_output += howmany; - } - return latin1_output - start; - } - - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char* latin1_output) { - size_t pos = 0; - char* start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store((int8_t*)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. 
For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1(in + pos, - utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); - res.count += pos; - return res; - } - if(pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - latin1_output += res.count; - } + 0 1 2 3 4 5 6 7 <--- word index + [ V | L | H | L | H | V | V | L ] + 1 0 0 0 0 1 1 0 - V = valid masks + 0 1 0 1 0 0 0 1 - L = low surrogate + 0 0 1 0 1 0 0 0 - H high surrogate + + + 1 0 0 0 0 1 1 0 V = valid masks + 0 1 0 1 0 0 0 0 a = L & (H >> 1) + 0 0 1 0 1 0 0 0 b = a << 1 + 1 1 1 1 1 1 1 0 c = V | a | b + ^ + the last bit can be zero, we just consume 7 + code units and recheck this word in the next iteration +*/ +template +const result validate_utf16_with_errors(const char16_t *input, size_t size) { + if (simdutf_unlikely(size == 0)) { + return result(error_code::SUCCESS, 0); + } + + const char16_t *start = input; + const char16_t *end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::SIZE * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = + simd16(input + simd16::SIZE / sizeof(char16_t)); + + // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16 + // and yields a single vector having only higher bytes of characters. + const auto in = utf16_gather_high_bytes(in0, in1); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). 
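+ // On the gathered high bytes this is the test (byte & 0xF8) == 0xD8, + // which accepts exactly 0xD8..0xDF, i.e. words 0xD800..0xDFFF: for + // instance 0xDA & 0xF8 == 0xD8 (surrogate range) while 0xD7 & 0xF8 == + // 0xD0 (valid word).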
+ const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint16_t surrogates_bitmask = + static_cast(surrogates_wordmask.to_bitmask()); + if (surrogates_bitmask == 0x0000) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher byte) + + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint16_t V = static_cast(~surrogates_bitmask); + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint16_t H = static_cast(vH.to_bitmask()); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint16_t L = static_cast(~H & surrogates_bitmask); + + const uint16_t a = static_cast( + L & (H >> 1)); // A low surrogate must be followed by a high one. + // (A low surrogate placed in the last register word + // is an exception we handle.) + const uint16_t b = static_cast( + a << 1); // Just mark that the opposite fact holds, + // thanks to that we have only two masks for the valid case. + const uint16_t c = static_cast( + V | a | b); // Combine all the masks into the final one. + + if (c == 0xffff) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += 16; + } else if (c == 0x7fff) { + // The 15 lower code units of the input register contain valid UTF-16. + // The 15th word may be either a low or high surrogate. In the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject a sole high surrogate.
+ input += 15; + } else { + return result(error_code::SURROGATE, input - start); } - return result(error_code::SUCCESS, latin1_output - start); } + } + + return result(error_code::SUCCESS, input - start); +} + +template +const result validate_utf16_as_ascii_with_errors(const char16_t *input, + size_t size) { + if (simdutf_unlikely(size == 0)) { + return result(error_code::SUCCESS, 0); + } + size_t pos = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input_vec( + reinterpret_cast(input + pos)); + if simdutf_constexpr (!match_system(big_endian)) { + input_vec.swap_bytes(); + } + uint64_t matches = input_vec.lteq(uint16_t(0x7f)); + if (~matches) { + // Found a match, return the first one + int index = trailing_zeroes(~matches) / 2; + return result(error_code::TOO_LARGE, pos + index); + } + } + + // Scalar tail + while (pos < size) { - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); + char16_t v = scalar::utf16::swap_if_needed(input[pos]); + if (v > 0x7F) { + return result(error_code::TOO_LARGE, pos); } + pos++; + } + return result(error_code::SUCCESS, size); +} - }; // struct utf8_checker -} // utf8_to_latin1 namespace +} // namespace utf16 } // unnamed namespace -} // namespace westmere +} // namespace lsx } // namespace simdutf -/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ -/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +/* end file src/generic/validate_utf16.h */ +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 +/* begin file src/generic/utf32.h */ +#include namespace simdutf { -namespace westmere { +namespace lsx { namespace { -namespace utf8_to_latin1 { -using namespace simd; +namespace utf32 { +template T min(T a, T b) { return a <= b ? a : b; } - simdutf_really_inline size_t convert_valid(const char* in, size_t size, char* latin1_output) { - size_t pos = 0; - char* start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store((int8_t*)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. 
- size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1(in + pos, - utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { + using vector_u32 = simd32; + + const char32_t *start = input; + + // we add up to three ones in a single iteration (see the vectorized loop in + // section #2 below) + const size_t max_increment = 3; + + const size_t N = vector_u32::ELEMENTS; + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else + const auto v_ffffff80 = vector_u32::splat(0xffffff80); + const auto v_fffff800 = vector_u32::splat(0xfffff800); + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + size_t counter = 0; + + // 1. 
vectorized loop unrolled 4 times + { + // we use vector of uint32 counters, this is why this limit is used + const size_t max_iterations = + std::numeric_limits::max() / (max_increment * 4); + size_t blocks = length / (N * 4); + length -= blocks * (N * 4); + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + simd32 acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in0 = vector_u32(input + 0 * N); + const auto in1 = vector_u32(input + 1 * N); + const auto in2 = vector_u32(input + 2 * N); + const auto in3 = vector_u32(input + 3 * N); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else + acc += min(one, in0 & v_ffffff80); + acc += min(one, in1 & v_ffffff80); + acc += min(one, in2 & v_ffffff80); + acc += min(one, in3 & v_ffffff80); + + acc += min(one, in0 & v_fffff800); + acc += min(one, in1 & v_fffff800); + acc += min(one, in2 & v_fffff800); + acc += min(one, in3 & v_fffff800); + + acc += min(one, in0 & v_ffff0000); + acc += min(one, in1 & v_ffff0000); + acc += min(one, in2 & v_ffff0000); + acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += 4 * N; } - if(pos < size) { - size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, latin1_output); - latin1_output += howmany; + + counter += acc.sum(); + } + } + + // 2. vectorized loop for tail + { + const size_t max_iterations = + std::numeric_limits::max() / max_increment; + size_t blocks = length / N; + length -= blocks * N; + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + auto acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in = vector_u32(input); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else + acc += min(one, in & v_ffffff80); + acc += min(one, in & v_fffff800); + acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += N; } - return latin1_output - start; + + counter += acc.sum(); } + } + const size_t consumed = input - start; + if (consumed != 0) { + // We don't count 0th bytes in the vectorized loops above, this + // is why we need to count them in the end. 
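+ // (Per code point the UTF-8 length is 1 + (cp > 0x7f) + (cp > 0x7ff) + + // (cp > 0xffff); the loops above accumulate only the three conditional + // terms, so the baseline of 1 per consumed char is added back here.)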
+ counter += consumed; } -} // utf8_to_latin1 namespace -} // unnamed namespace -} // namespace westmere - // namespace simdutf -/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ + return counter + scalar::utf32::utf8_length_from_utf32(input, length); +} + +} // namespace utf32 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf32.h */ +#endif // SIMDUTF_FEATURE_UTF32 // // Implementation-specific overrides // - namespace simdutf { -namespace westmere { +namespace lsx { -simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { +#if SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { // If there is a BOM, then we trust it. auto bom_encoding = simdutf::BOM::check_bom(input, length); - if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - if (length % 2 == 0) { - return sse_detect_encodings(input, length); - } else { - if (implementation::validate_utf8(input, length)) { - return simdutf::encoding_type::UTF8; - } else { - return simdutf::encoding_type::unspecified; + // todo: reimplement as a one-pass algorithm. + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + int out = 0; + if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; + } + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { + out |= encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + out |= encoding_type::UTF32_LE; } } + return out; } +#endif // SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return westmere::utf8_validation::generic_validate_utf8(buf, len); +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return lsx::utf8_validation::generic_validate_utf8(buf, len); } +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { - return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len); +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return lsx::utf8_validation::generic_validate_utf8_with_errors(buf, len); } +#endif // SIMDUTF_FEATURE_UTF8 -simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return westmere::utf8_validation::generic_validate_ascii(buf, len); +#if SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return lsx::ascii_validation::generic_validate_ascii(buf, len); } -simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { - return westmere::utf8_validation::generic_validate_ascii_with_errors(buf,len); +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return lsx::ascii_validation::generic_validate_ascii_with_errors(buf, len); +} +#endif // SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +simdutf_warn_unused bool 
+implementation::validate_utf16le_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return lsx::utf16::validate_utf16_as_ascii_with_errors( + buf, len) + .error == SUCCESS; } -simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = sse_validate_utf16(buf, len); - if (tail) { - return scalar::utf16::validate(tail, len - (tail - buf)); - } else { +simdutf_warn_unused bool +implementation::validate_utf16be_as_ascii(const char16_t *buf, + size_t len) const noexcept { + return lsx::utf16::validate_utf16_as_ascii_with_errors(buf, + len) + .error == SUCCESS; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid; this protects the implementation from nullptr. + return true; + } + const auto res = + lsx::utf16::validate_utf16_with_errors(buf, len); + + if (res.is_err()) { return false; } + + if (res.count != len) { + return scalar::utf16::validate(buf + res.count, + len - res.count); + } + + return true; } +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = sse_validate_utf16(buf, len); - if (tail) { - return scalar::utf16::validate(tail, len - (tail - buf)); - } else { +#if SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid; this protects the implementation from nullptr.
+ return true; + } + const auto res = + lsx::utf16::validate_utf16_with_errors(buf, len); + + if (res.is_err()) { return false; } + + if (res.count != len) { + return scalar::utf16::validate(buf + res.count, + len - res.count); + } + + return true; } -simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { - result res = sse_validate_utf16_with_errors(buf, len); +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + const result res = + lsx::utf16::validate_utf16_with_errors(buf, len); if (res.count != len) { - result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); + const result scalar_res = + scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); return result(scalar_res.error, res.count + scalar_res.count); } else { return res; } } -simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { - result res = sse_validate_utf16_with_errors(buf, len); +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + const result res = + lsx::utf16::validate_utf16_with_errors(buf, len); if (res.count != len) { - result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); + const result scalar_res = + scalar::utf16::validate_with_errors(buf + res.count, + len - res.count); return result(scalar_res.error, res.count + scalar_res.count); } else { return res; } } -simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - const char32_t* tail = sse_validate_utf32le(buf, len); +void implementation::to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept { + utf16::to_well_formed(input, len, output); +} + +void implementation::to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept { + utf16::to_well_formed(input, len, output); +} +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid; this protects the implementation from nullptr.
+ return true; + } + const char32_t *tail = lsx_validate_utf32le(buf, len); if (tail) { return scalar::utf32::validate(tail, len - (tail - buf)); } else { return false; } } +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING -simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { - result res = sse_validate_utf32le_with_errors(buf, len); +#if SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = lsx_validate_utf32le_with_errors(buf, len); if (res.count != len) { - result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + result scalar_res = + scalar::utf32::validate_with_errors(buf + res.count, len - res.count); return result(scalar_res.error, res.count + scalar_res.count); } else { return res; } } +#endif // SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { - - std::pair ret = sse_convert_latin1_to_utf8(buf, len, utf8_output); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + lsx_convert_latin1_to_utf8(buf, len, utf8_output); size_t converted_chars = ret.second - utf8_output; if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lsx_convert_latin1_to_utf16le(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lsx_convert_latin1_to_utf16be(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); converted_chars += scalar_converted_chars; } - return converted_chars; } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = sse_convert_latin1_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t converted_chars = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( - ret.first, len - (ret.first - buf), 
ret.second); - if (scalar_converted_chars == 0) { return 0; } - converted_chars += scalar_converted_chars; - } - return converted_chars; -} - -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = sse_convert_latin1_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t converted_chars = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_converted_chars == 0) { return 0; } - converted_chars += scalar_converted_chars; - } - return converted_chars; -} - -simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = sse_convert_latin1_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } - size_t converted_chars = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_converted_chars == 0) { return 0; } - converted_chars += scalar_converted_chars; - } - return converted_chars; +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + lsx_convert_latin1_to_utf32(buf, len, utf32_output); + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; } +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - -simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { utf8_to_latin1::validating_transcoder converter; return converter.convert(buf, len, latin1_output); } -simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept { +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { utf8_to_latin1::validating_transcoder converter; return converter.convert_with_errors(buf, len, latin1_output); } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { - return westmere::utf8_to_latin1::convert_valid(buf,len,latin1_output); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return lsx::utf8_to_latin1::convert_valid(buf, len, latin1_output); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t 
*utf16_output) const noexcept { utf8_to_utf16::validating_transcoder converter; return converter.convert(buf, len, utf16_output); } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { utf8_to_utf16::validating_transcoder converter; return converter.convert(buf, len, utf16_output); } -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { utf8_to_utf16::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf16_output); + return converter.convert_with_errors(buf, len, + utf16_output); } -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { utf8_to_utf16::validating_transcoder converter; return converter.convert_with_errors(buf, len, utf16_output); } - -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size, - char16_t* utf16_output) const noexcept { - return utf8_to_utf16::convert_valid(input, size, utf16_output); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size, - char16_t* utf16_output) const noexcept { - return utf8_to_utf16::convert_valid(input, size, utf16_output); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { utf8_to_utf32::validating_transcoder converter; return converter.convert(buf, len, utf32_output); } -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { utf8_to_utf32::validating_transcoder converter; return converter.convert_with_errors(buf, len, utf32_output); } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, - char32_t* utf32_output) const noexcept { - return utf8_to_utf32::convert_valid(input, size, utf32_output); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return 
utf8_to_utf32::convert_valid(input, size, utf32_output); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = sse_convert_utf16_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { return 0; } +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } size_t saved_bytes = ret.second - latin1_output; if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = sse_convert_utf16_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { return 0; } +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } size_t saved_bytes = ret.second - latin1_output; if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = sse_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -38075,16 +65576,26 @@ simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors ret.second += scalar_res.count; } } - ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units 
written + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written return ret.first; } -simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = sse_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_latin1_with_errors(buf, len, + latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -38092,54 +65603,81 @@ simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors ret.second += scalar_res.count; } } - ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written return ret.first; } - -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - // optimization opportunity: we could provide an optimized function. +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function. return convert_utf16be_to_latin1(buf, len, latin1_output); } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { - // optimization opportunity: we could provide an optimized function. +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function. 
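/* Editorial sketch (plain C, also valid C++), not part of the patch: every
 * converter in this file follows the same shape. A vectorized kernel
 * (lsx_convert_* here, sse_convert_* in the removed westmere code) consumes
 * as much input as it can and reports where it stopped; a scalar routine
 * (scalar::*::convert) finishes the tail, with 0 signalling invalid input.
 * The names simd_kernel/scalar_tail below are placeholders, not simdutf
 * functions; the "kernel" merely uppercases ASCII in 8-byte blocks so the
 * example stays self-contained and runnable. */
#include <stddef.h>
#include <stdio.h>

typedef struct {
  const char *next_in; /* first unconsumed input byte */
  char *next_out;      /* one past the last byte written */
} span_result;

/* "Fast path": processes whole 8-byte blocks, stops at the first block
 * containing a non-ASCII byte. */
static span_result simd_kernel(const char *in, size_t len, char *out) {
  span_result r = {in, out};
  while ((size_t)(r.next_in - in) + 8 <= len) {
    for (int i = 0; i < 8; i++)
      if ((unsigned char)r.next_in[i] > 0x7f)
        return r; /* stop; the scalar tail takes over from here */
    for (int i = 0; i < 8; i++) {
      char c = r.next_in[i];
      *r.next_out++ = (c >= 'a' && c <= 'z') ? (char)(c - 32) : c;
    }
    r.next_in += 8;
  }
  return r;
}

/* Scalar fallback: handles the remainder one byte at a time; returns 0 on
 * invalid input, like the scalar::*::convert routines above. */
static size_t scalar_tail(const char *in, size_t len, char *out) {
  for (size_t i = 0; i < len; i++) {
    if ((unsigned char)in[i] > 0x7f)
      return 0;
    out[i] = (in[i] >= 'a' && in[i] <= 'z') ? (char)(in[i] - 32) : in[i];
  }
  return len;
}

/* The composition pattern: kernel first, scalar fallback for the tail. */
static size_t convert(const char *in, size_t len, char *out) {
  span_result r = simd_kernel(in, len, out);
  size_t written = (size_t)(r.next_out - out);
  if (r.next_in != in + len) { /* kernel stopped early */
    size_t tail =
        scalar_tail(r.next_in, (size_t)(in + len - r.next_in), r.next_out);
    if (tail == 0)
      return 0; /* invalid input */
    written += tail;
  }
  return written;
}

int main(void) {
  char out[32];
  size_t n = convert("hello simdutf!", 14, out);
  printf("%zu: %.*s\n", n, (int)n, out); /* prints: 14: HELLO SIMDUTF! */
  return 0;
}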
return convert_utf16le_to_latin1(buf, len, latin1_output); } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = sse_convert_utf16_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } size_t saved_bytes = ret.second - utf8_output; if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = sse_convert_utf16_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } size_t saved_bytes = ret.second - utf8_output; if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = westmere::sse_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -38147,17 +65685,27 @@ simdutf_warn_unused result 
implementation::convert_utf16le_to_utf8_with_errors(c ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written return ret.first; } -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = westmere::sse_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -38165,74 +65713,58 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(c ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written return ret.first; } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { return convert_utf16le_to_utf8(buf, len, utf8_output); } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { return convert_utf16be_to_utf8(buf, len, utf8_output); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - std::pair ret = sse_convert_utf32_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - latin1_output; - // if (ret.first != buf + len) { - if (ret.first < buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) 
const noexcept { + if (simdutf_unlikely(len == 0)) { + return 0; } - return saved_bytes; -} - - -simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = westmere::sse_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } + std::pair ret = + lsx_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; } - ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; -} - -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { - // optimization opportunity: we could provide an optimized function. - return convert_utf32_to_latin1(buf,len,latin1_output); -} - -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = sse_convert_utf32_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } size_t saved_bytes = ret.second - utf8_output; if (ret.first != buf + len) { const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -38240,43 +65772,97 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(con ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written return ret.first; } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const 
noexcept { - std::pair ret = sse_convert_utf16_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } size_t saved_bytes = ret.second - utf32_output; if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = sse_convert_utf16_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } size_t saved_bytes = ret.second - utf32_output; if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = westmere::sse_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the 
position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -38284,17 +65870,45 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written return ret.first; } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = westmere::sse_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -38302,46 +65916,91 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written return ret.first; } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const 
noexcept { +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid( + ret.first, len - (ret.first - buf), ret.second); + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // optimization opportunity: implement a custom function. return convert_utf32_to_utf8(buf, len, utf8_output); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = sse_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lsx_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } size_t saved_bytes = ret.second - utf16_output; if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } + return saved_bytes; } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = sse_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lsx_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } size_t saved_bytes = ret.second - utf16_output; if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } saved_bytes += scalar_saved_bytes; } return saved_bytes; } -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = westmere::sse_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // 
ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -38349,16 +66008,23 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written return ret.first; } -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of code units written even if finished - std::pair ret = westmere::sse_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); if (scalar_res.error) { scalar_res.count += ret.first.count; return scalar_res; @@ -38366,214 +66032,1242 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written return ret.first; } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { return convert_utf32_to_utf16le(buf, len, utf16_output); } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { return convert_utf32_to_utf16be(buf, len, utf16_output); } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { return convert_utf16le_to_utf32(buf, len, utf32_output); } -simdutf_warn_unused size_t 
implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { return convert_utf16be_to_utf32(buf, len, utf32_output); } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { +#if SIMDUTF_FEATURE_UTF16 +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { utf16::change_endianness_utf16(input, length, output); } -simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { return utf16::count_code_points(input, length); } -simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { return utf16::count_code_points(input, length); } +#endif // SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { +#if SIMDUTF_FEATURE_UTF8 +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { return utf8::count_code_points(input, length); } +#endif // SIMDUTF_FEATURE_UTF8 -simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { - return count_utf8(buf,len); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { - return scalar::utf16::latin1_length_from_utf16(length); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + const uint8_t *data = reinterpret_cast(input); + const uint8_t *data_end = data + length; + uint64_t result = 0; + while (data_end - data > 16) { + uint64_t two_bytes = 0; + __m128i input_vec = __lsx_vld(data, 0); + two_bytes = + __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0); + result += 16 + two_bytes; + data += 16; + } + return result + scalar::latin1::utf8_length_from_latin1((const char *)data, + data_end - data); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 -simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept { - return scalar::utf32::latin1_length_from_utf32(length); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16_bytemask(input, + length); } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, 
size_t length) const noexcept { + return utf16::utf8_length_from_utf16_bytemask(input, length); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); } -simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf16_length_from_latin1(length); +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf32_length_from_latin1(length); +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8_bytemask(input, length); } - -simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t len) const noexcept { - const uint8_t *str = reinterpret_cast(input); - size_t answer = len / sizeof(__m128i) * sizeof(__m128i); - size_t i = 0; - __m128i two_64bits = _mm_setzero_si128(); - while (i + sizeof(__m128i) <= len) { - __m128i runner = _mm_setzero_si128(); - size_t iterations = (len - i) / sizeof(__m128i); - if (iterations > 255) { - iterations = 255; - } - size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i); - for (; i + 4*sizeof(__m128i) <= max_i; i += 4*sizeof(__m128i)) { - __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i)); - __m128i input2 = _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i))); - __m128i input3 = _mm_loadu_si128((const __m128i *)(str + i + 2*sizeof(__m128i))); - __m128i input4 = _mm_loadu_si128((const __m128i *)(str + i + 3*sizeof(__m128i))); - __m128i input12 = _mm_add_epi8( - _mm_cmpgt_epi8( - _mm_setzero_si128(), - input1), - _mm_cmpgt_epi8( - _mm_setzero_si128(), - input2)); - __m128i input34 = _mm_add_epi8( - _mm_cmpgt_epi8( - _mm_setzero_si128(), - input3), - _mm_cmpgt_epi8( - _mm_setzero_si128(), - input4)); - __m128i input1234 = _mm_add_epi8(input12, input34); - runner = _mm_sub_epi8(runner, input1234); - } - for (; i <= max_i; i += sizeof(__m128i)) { - __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i)); - runner = _mm_sub_epi8( - runner, _mm_cmpgt_epi8(_mm_setzero_si128(), more_input)); - } - two_64bits = _mm_add_epi64( - two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128())); - } - answer += _mm_extract_epi64(two_64bits, 0) + - _mm_extract_epi64(two_64bits, 1); - return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast(str + i), len - i); -} - -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); +simdutf_warn_unused result +implementation::utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + 
endianness::LITTLE>(input, length); } -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); +simdutf_warn_unused result +implementation::utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::BIG>(input, length); } -simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { - return utf8::utf16_length_from_utf8(input, length); +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return utf32::utf8_length_from_utf32(input, length); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { - const __m128i v_00000000 = _mm_setzero_si128(); - const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80); - const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800); - const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); - size_t pos = 0; - size_t count = 0; - for(;pos + 4 <= length; pos += 4) { - __m128i in = _mm_loadu_si128((__m128i*)(input + pos)); - const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000); - const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000); - const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask); - const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); - const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask); - const uint16_t ascii_bytes_bitmask = static_cast(_mm_movemask_epi8(ascii_bytes_bytemask)); - const uint16_t two_bytes_bitmask = static_cast(_mm_movemask_epi8(two_bytes_bytemask)); - const uint16_t three_bytes_bitmask = static_cast(_mm_movemask_epi8(three_bytes_bytemask)); - - size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; - size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4; - size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4; - count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count; - } - return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); -} - -simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { - const __m128i v_00000000 = _mm_setzero_si128(); - const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const __m128i v_ffff = lsx_splat_u32(0x0000ffff); size_t pos = 0; size_t count = 0; - for(;pos + 4 <= length; pos += 4) { - __m128i in = _mm_loadu_si128((__m128i*)(input + pos)); - const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); - const uint16_t surrogate_bitmask = static_cast(_mm_movemask_epi8(surrogate_bytemask)); - size_t surrogate_count = (16-count_ones(surrogate_bitmask))/4; + for (; pos + 4 <= length; pos += 4) { + __m128i in = 
__lsx_vld(reinterpret_cast(input + pos), 0); + const __m128i surrogate_bytemask = __lsx_vslt_wu(v_ffff, in); + size_t surrogate_count = __lsx_vpickve2gr_bu( + __lsx_vpcnt_b(__lsx_vmskltz_w(surrogate_bytemask)), 0); count += 4 + surrogate_count; } - return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); + return count + + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { return utf8::count_code_points(input, length); } +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); -} - -simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept { - return (options & base64_url) ? compress_decode_base64(output, input, length, options) : compress_decode_base64(output, input, length, options); +#if SIMDUTF_FEATURE_BASE64 +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } } -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == 
base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } } -simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept { - return (options & base64_url) ? compress_decode_base64(output, input, length, options) : compress_decode_base64(output, input, length, options); +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } } -simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length, base64_options options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + if (options & base64_default_or_url) { + if (options == base64_options::base64_default_or_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } else { + return compress_decode_base64( + output, input, length, options, last_chunk_options); + } + } } -size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept { - if(options == base64_url) { - return encode_base64(output, input, length); +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + if (options & base64_url) { + return encode_base64(output, input, length, options); } else { - return encode_base64(output, input, length); + return encode_base64(output, input, length, options); } } -} // namespace westmere + +size_t implementation::binary_to_base64_with_lines( + const char *input, 
size_t length, char *output, size_t line_length, + base64_options options) const noexcept { + return scalar::base64::tail_encode_base64_impl(output, input, length, + options, line_length); +} + +const char *implementation::find(const char *start, const char *end, + char character) const noexcept { + return util_find(start, end, character); +} + +const char16_t *implementation::find(const char16_t *start, const char16_t *end, + char16_t character) const noexcept { + return util_find(start, end, character); +} +#endif // SIMDUTF_FEATURE_BASE64 + +} // namespace lsx } // namespace simdutf -/* begin file src/simdutf/westmere/end.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE -// nothing needed. -#else -SIMDUTF_UNTARGET_REGION +/* begin file src/simdutf/lsx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP +/* end file src/simdutf/lsx/end.h */ +/* end file src/lsx/implementation.cpp */ #endif -/* end file src/simdutf/westmere/end.h */ -/* end file src/westmere/implementation.cpp */ +/* begin file src/simdutf_c.cpp */ +/* begin file include/simdutf_c.h */ +/*** + * simdutf_c.h - C API for simdutf + * This is currently experimental. + * We are committed to keeping the C API, but there might be mistakes in our + * implementation. Please report any issues you find. + */ + +#ifndef SIMDUTF_C_H +#define SIMDUTF_C_H + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#ifdef __has_include + #if __has_include(<uchar.h>) + #include <uchar.h> + #else // __has_include(<uchar.h>) + #define char16_t uint16_t + #define char32_t uint32_t + #endif // __has_include(<uchar.h>) +#else // __has_include() + #define char16_t uint16_t + #define char32_t uint32_t +#endif // __has_include + +#ifdef __cplusplus +extern "C" { #endif +/* C-friendly subset of simdutf errors */ +typedef enum simdutf_error_code { + SIMDUTF_ERROR_SUCCESS = 0, + SIMDUTF_ERROR_HEADER_BITS, + SIMDUTF_ERROR_TOO_SHORT, + SIMDUTF_ERROR_TOO_LONG, + SIMDUTF_ERROR_OVERLONG, + SIMDUTF_ERROR_TOO_LARGE, + SIMDUTF_ERROR_SURROGATE, + SIMDUTF_ERROR_INVALID_BASE64_CHARACTER, + SIMDUTF_ERROR_BASE64_INPUT_REMAINDER, + SIMDUTF_ERROR_BASE64_EXTRA_BITS, + SIMDUTF_ERROR_OUTPUT_BUFFER_TOO_SMALL, + SIMDUTF_ERROR_OTHER +} simdutf_error_code; + +typedef struct simdutf_result { + simdutf_error_code error; + size_t count; /* position of error or number of code units validated */ +} simdutf_result; + +typedef enum simdutf_encoding_type { + SIMDUTF_ENCODING_UNSPECIFIED = 0, + SIMDUTF_ENCODING_UTF8 = 1, + SIMDUTF_ENCODING_UTF16_LE = 2, + SIMDUTF_ENCODING_UTF16_BE = 4, + SIMDUTF_ENCODING_UTF32_LE = 8, + SIMDUTF_ENCODING_UTF32_BE = 16 +} simdutf_encoding_type; + +/* Validate UTF-8: returns true iff input is valid UTF-8 */ +bool simdutf_validate_utf8(const char *buf, size_t len); + +/* Validate UTF-8 with detailed result */ +simdutf_result simdutf_validate_utf8_with_errors(const char *buf, size_t len); + +/* Encoding detection */ +simdutf_encoding_type simdutf_autodetect_encoding(const char *input, + size_t length); +int simdutf_detect_encodings(const char *input, size_t length); + +/* ASCII validation */ +bool simdutf_validate_ascii(const char *buf, size_t len); +simdutf_result simdutf_validate_ascii_with_errors(const char *buf, size_t len); + +/* UTF-16 ASCII checks */ +bool simdutf_validate_utf16_as_ascii(const char16_t *buf, size_t len); +bool simdutf_validate_utf16be_as_ascii(const char16_t *buf, size_t len); +bool simdutf_validate_utf16le_as_ascii(const char16_t *buf, size_t len); + +/* UTF-16/UTF-8/UTF-32 validation (native/endian-specific) */ +bool simdutf_validate_utf16(const char16_t *buf, size_t len); +bool
simdutf_validate_utf16le(const char16_t *buf, size_t len); +bool simdutf_validate_utf16be(const char16_t *buf, size_t len); +simdutf_result simdutf_validate_utf16_with_errors(const char16_t *buf, + size_t len); +simdutf_result simdutf_validate_utf16le_with_errors(const char16_t *buf, + size_t len); +simdutf_result simdutf_validate_utf16be_with_errors(const char16_t *buf, + size_t len); + +bool simdutf_validate_utf32(const char32_t *buf, size_t len); +simdutf_result simdutf_validate_utf32_with_errors(const char32_t *buf, + size_t len); + +/* to_well_formed UTF-16 helpers */ +void simdutf_to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output); +void simdutf_to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output); +void simdutf_to_well_formed_utf16(const char16_t *input, size_t len, + char16_t *output); + +/* Counting */ +size_t simdutf_count_utf16(const char16_t *input, size_t length); +size_t simdutf_count_utf16le(const char16_t *input, size_t length); +size_t simdutf_count_utf16be(const char16_t *input, size_t length); +size_t simdutf_count_utf8(const char *input, size_t length); + +/* Length estimators */ +size_t simdutf_utf8_length_from_latin1(const char *input, size_t length); +size_t simdutf_latin1_length_from_utf8(const char *input, size_t length); +size_t simdutf_latin1_length_from_utf16(size_t length); +size_t simdutf_latin1_length_from_utf32(size_t length); +size_t simdutf_utf16_length_from_utf8(const char *input, size_t length); +size_t simdutf_utf32_length_from_utf8(const char *input, size_t length); +size_t simdutf_utf8_length_from_utf16(const char16_t *input, size_t length); +simdutf_result +simdutf_utf8_length_from_utf16_with_replacement(const char16_t *input, + size_t length); +size_t simdutf_utf8_length_from_utf16le(const char16_t *input, size_t length); +size_t simdutf_utf8_length_from_utf16be(const char16_t *input, size_t length); +simdutf_result +simdutf_utf8_length_from_utf16le_with_replacement(const char16_t *input, + size_t length); +simdutf_result +simdutf_utf8_length_from_utf16be_with_replacement(const char16_t *input, + size_t length); + +/* Conversions: latin1 <-> utf8, utf8 <-> utf16/utf32, utf16 <-> utf8, etc. 
*/ +size_t simdutf_convert_latin1_to_utf8(const char *input, size_t length, + char *output); +size_t simdutf_convert_latin1_to_utf8_safe(const char *input, size_t length, + char *output, size_t utf8_len); +size_t simdutf_convert_latin1_to_utf16le(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_latin1_to_utf16be(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_latin1_to_utf32(const char *input, size_t length, + char32_t *output); + +size_t simdutf_convert_utf8_to_latin1(const char *input, size_t length, + char *output); +size_t simdutf_convert_utf8_to_utf16le(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_utf8_to_utf16be(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_utf8_to_utf16(const char *input, size_t length, + char16_t *output); + +size_t simdutf_convert_utf8_to_utf32(const char *input, size_t length, + char32_t *output); +simdutf_result simdutf_convert_utf8_to_latin1_with_errors(const char *input, + size_t length, + char *output); +simdutf_result simdutf_convert_utf8_to_utf16_with_errors(const char *input, + size_t length, + char16_t *output); +simdutf_result simdutf_convert_utf8_to_utf16le_with_errors(const char *input, + size_t length, + char16_t *output); +simdutf_result simdutf_convert_utf8_to_utf16be_with_errors(const char *input, + size_t length, + char16_t *output); +simdutf_result simdutf_convert_utf8_to_utf32_with_errors(const char *input, + size_t length, + char32_t *output); + +/* Conversions assuming valid input */ +size_t simdutf_convert_valid_utf8_to_latin1(const char *input, size_t length, + char *output); +size_t simdutf_convert_valid_utf8_to_utf16le(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_valid_utf8_to_utf16be(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_valid_utf8_to_utf32(const char *input, size_t length, + char32_t *output); + +/* UTF-16 -> UTF-8 and related conversions */ +size_t simdutf_convert_utf16_to_utf8(const char16_t *input, size_t length, + char *output); +size_t simdutf_convert_utf16le_to_utf8(const char16_t *input, size_t length, + char *output); +size_t simdutf_convert_utf16be_to_utf8(const char16_t *input, size_t length, + char *output); +size_t simdutf_convert_utf16_to_utf8_safe(const char16_t *input, size_t length, + char *output, size_t utf8_len); +size_t simdutf_convert_utf16_to_latin1(const char16_t *input, size_t length, + char *output); +size_t simdutf_convert_utf16le_to_latin1(const char16_t *input, size_t length, + char *output); +size_t simdutf_convert_utf16be_to_latin1(const char16_t *input, size_t length, + char *output); +simdutf_result +simdutf_convert_utf16_to_latin1_with_errors(const char16_t *input, + size_t length, char *output); +simdutf_result +simdutf_convert_utf16le_to_latin1_with_errors(const char16_t *input, + size_t length, char *output); +simdutf_result +simdutf_convert_utf16be_to_latin1_with_errors(const char16_t *input, + size_t length, char *output); + +simdutf_result simdutf_convert_utf16_to_utf8_with_errors(const char16_t *input, + size_t length, + char *output); +simdutf_result +simdutf_convert_utf16le_to_utf8_with_errors(const char16_t *input, + size_t length, char *output); +simdutf_result +simdutf_convert_utf16be_to_utf8_with_errors(const char16_t *input, + size_t length, char *output); + +size_t simdutf_convert_valid_utf16_to_utf8(const char16_t *input, size_t length, + char *output); +size_t 
simdutf_convert_valid_utf16_to_latin1(const char16_t *input, + size_t length, char *output); +size_t simdutf_convert_valid_utf16le_to_latin1(const char16_t *input, + size_t length, char *output); +size_t simdutf_convert_valid_utf16be_to_latin1(const char16_t *input, + size_t length, char *output); + +size_t simdutf_convert_valid_utf16le_to_utf8(const char16_t *input, + size_t length, char *output); +size_t simdutf_convert_valid_utf16be_to_utf8(const char16_t *input, + size_t length, char *output); + +/* UTF-16 <-> UTF-32 conversions */ +size_t simdutf_convert_utf16_to_utf32(const char16_t *input, size_t length, + char32_t *output); +size_t simdutf_convert_utf16le_to_utf32(const char16_t *input, size_t length, + char32_t *output); +size_t simdutf_convert_utf16be_to_utf32(const char16_t *input, size_t length, + char32_t *output); +simdutf_result simdutf_convert_utf16_to_utf32_with_errors(const char16_t *input, + size_t length, + char32_t *output); +simdutf_result +simdutf_convert_utf16le_to_utf32_with_errors(const char16_t *input, + size_t length, char32_t *output); +simdutf_result +simdutf_convert_utf16be_to_utf32_with_errors(const char16_t *input, + size_t length, char32_t *output); + +/* Valid UTF-16 conversions */ +size_t simdutf_convert_valid_utf16_to_utf32(const char16_t *input, + size_t length, char32_t *output); +size_t simdutf_convert_valid_utf16le_to_utf32(const char16_t *input, + size_t length, char32_t *output); +size_t simdutf_convert_valid_utf16be_to_utf32(const char16_t *input, + size_t length, char32_t *output); + +/* UTF-32 -> ... conversions */ +size_t simdutf_convert_utf32_to_utf8(const char32_t *input, size_t length, + char *output); +simdutf_result simdutf_convert_utf32_to_utf8_with_errors(const char32_t *input, + size_t length, + char *output); +size_t simdutf_convert_valid_utf32_to_utf8(const char32_t *input, size_t length, + char *output); + +size_t simdutf_convert_utf32_to_utf16(const char32_t *input, size_t length, + char16_t *output); +size_t simdutf_convert_utf32_to_utf16le(const char32_t *input, size_t length, + char16_t *output); +size_t simdutf_convert_utf32_to_utf16be(const char32_t *input, size_t length, + char16_t *output); +simdutf_result +simdutf_convert_utf32_to_latin1_with_errors(const char32_t *input, + size_t length, char *output); + +/* --- Find helpers --- */ +const char *simdutf_find(const char *start, const char *end, char character); +const char16_t *simdutf_find_utf16(const char16_t *start, const char16_t *end, + char16_t character); + +/* --- Base64 enums and helpers --- */ +typedef enum simdutf_base64_options { + SIMDUTF_BASE64_DEFAULT = 0, + SIMDUTF_BASE64_URL = 1, + SIMDUTF_BASE64_DEFAULT_NO_PADDING = 2, + SIMDUTF_BASE64_URL_WITH_PADDING = 3, + SIMDUTF_BASE64_DEFAULT_ACCEPT_GARBAGE = 4, + SIMDUTF_BASE64_URL_ACCEPT_GARBAGE = 5, + SIMDUTF_BASE64_DEFAULT_OR_URL = 8, + SIMDUTF_BASE64_DEFAULT_OR_URL_ACCEPT_GARBAGE = 12 +} simdutf_base64_options; + +typedef enum simdutf_last_chunk_handling_options { + SIMDUTF_LAST_CHUNK_LOOSE = 0, + SIMDUTF_LAST_CHUNK_STRICT = 1, + SIMDUTF_LAST_CHUNK_STOP_BEFORE_PARTIAL = 2, + SIMDUTF_LAST_CHUNK_ONLY_FULL_CHUNKS = 3 +} simdutf_last_chunk_handling_options; + +/* maximal binary length estimators */ +size_t simdutf_maximal_binary_length_from_base64(const char *input, + size_t length); +size_t simdutf_maximal_binary_length_from_base64_utf16(const char16_t *input, + size_t length); + +/* base64 decoding/encoding */ +simdutf_result simdutf_base64_to_binary( + const char *input, size_t length, char *output, + 
simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options); +simdutf_result simdutf_base64_to_binary_utf16( + const char16_t *input, size_t length, char *output, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options); + +size_t simdutf_base64_length_from_binary(size_t length, + simdutf_base64_options options); +size_t simdutf_base64_length_from_binary_with_lines( + size_t length, simdutf_base64_options options, size_t line_length); + +size_t simdutf_binary_to_base64(const char *input, size_t length, char *output, + simdutf_base64_options options); +size_t simdutf_binary_to_base64_with_lines(const char *input, size_t length, + char *output, size_t line_length, + simdutf_base64_options options); + +/* safe decoding that provides an in/out outlen parameter */ +simdutf_result simdutf_base64_to_binary_safe( + const char *input, size_t length, char *output, size_t *outlen, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options, + bool decode_up_to_bad_char); +simdutf_result simdutf_base64_to_binary_safe_utf16( + const char16_t *input, size_t length, char *output, size_t *outlen, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options, + bool decode_up_to_bad_char); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* SIMDUTF_C_H */ +/* end file include/simdutf_c.h */ + +static simdutf_result to_c_result(const simdutf::result &r) { + simdutf_result out; + out.error = static_cast<simdutf_error_code>(r.error); + out.count = r.count; + return out; +} + +/* The C wrapper depends on the library features. Only expose the C API + when all relevant features are enabled. This helps the + single-header generator to omit the C wrapper when features are + disabled. */
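/*
 * A minimal usage sketch of the C API declared above (illustrative only;
 * it assumes the header is installed as <simdutf_c.h> and the library is
 * linked):
 *
 *   #include <stdio.h>
 *   #include <string.h>
 *   #include <simdutf_c.h>
 *
 *   int main(void) {
 *     const char *good = "caf\xC3\xA9"; // "café", valid UTF-8
 *     const char *bad = "caf\xC3";      // truncated two-byte sequence
 *     printf("valid? %d\n", simdutf_validate_utf8(good, strlen(good)));
 *     simdutf_result r = simdutf_validate_utf8_with_errors(bad, strlen(bad));
 *     if (r.error != SIMDUTF_ERROR_SUCCESS)
 *       printf("error %d at byte %zu\n", (int)r.error, r.count);
 *     return 0;
 *   }
 */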
+// clang-format off +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_ASCII && SIMDUTF_FEATURE_BASE64 && SIMDUTF_FEATURE_DETECT_ENCODING +// clang-format on +extern "C" { + +bool simdutf_validate_utf8(const char *buf, size_t len) { + return simdutf::validate_utf8(buf, len); +} + +simdutf_result simdutf_validate_utf8_with_errors(const char *buf, size_t len) { + return to_c_result(simdutf::validate_utf8_with_errors(buf, len)); +} + +simdutf_encoding_type simdutf_autodetect_encoding(const char *input, + size_t length) { + return static_cast<simdutf_encoding_type>( + simdutf::autodetect_encoding(input, length)); +} + +int simdutf_detect_encodings(const char *input, size_t length) { + return simdutf::detect_encodings(input, length); +} + +bool simdutf_validate_ascii(const char *buf, size_t len) { + return simdutf::validate_ascii(buf, len); +} +simdutf_result simdutf_validate_ascii_with_errors(const char *buf, size_t len) { + return to_c_result(simdutf::validate_ascii_with_errors(buf, len)); +} + +bool simdutf_validate_utf16_as_ascii(const char16_t *buf, size_t len) { + return simdutf::validate_utf16_as_ascii( + reinterpret_cast<const char16_t *>(buf), len); +} +bool simdutf_validate_utf16be_as_ascii(const char16_t *buf, size_t len) { + return simdutf::validate_utf16be_as_ascii( + reinterpret_cast<const char16_t *>(buf), len); +} +bool simdutf_validate_utf16le_as_ascii(const char16_t *buf, size_t len) { + return simdutf::validate_utf16le_as_ascii( + reinterpret_cast<const char16_t *>(buf), len); +} + +bool simdutf_validate_utf16(const char16_t *buf, size_t len) { + return simdutf::validate_utf16(reinterpret_cast<const char16_t *>(buf), len); +} +bool simdutf_validate_utf16le(const char16_t *buf, size_t len) { + return simdutf::validate_utf16le(reinterpret_cast<const char16_t *>(buf), + len); +} +bool simdutf_validate_utf16be(const char16_t *buf, size_t len) { + return simdutf::validate_utf16be(reinterpret_cast<const char16_t *>(buf), + len); +} +simdutf_result simdutf_validate_utf16_with_errors(const char16_t *buf, + size_t len) { + return to_c_result(simdutf::validate_utf16_with_errors( + reinterpret_cast<const char16_t *>(buf), len)); +} +simdutf_result simdutf_validate_utf16le_with_errors(const char16_t *buf, + size_t len) { + return to_c_result(simdutf::validate_utf16le_with_errors( + reinterpret_cast<const char16_t *>(buf), len)); +} +simdutf_result simdutf_validate_utf16be_with_errors(const char16_t *buf, + size_t len) { + return to_c_result(simdutf::validate_utf16be_with_errors( + reinterpret_cast<const char16_t *>(buf), len)); +} + +bool simdutf_validate_utf32(const char32_t *buf, size_t len) { + return simdutf::validate_utf32(reinterpret_cast<const char32_t *>(buf), len); +} +simdutf_result simdutf_validate_utf32_with_errors(const char32_t *buf, + size_t len) { + return to_c_result(simdutf::validate_utf32_with_errors( + reinterpret_cast<const char32_t *>(buf), len)); +} + +void simdutf_to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) { + simdutf::to_well_formed_utf16le(reinterpret_cast<const char16_t *>(input), + len, reinterpret_cast<char16_t *>(output)); +} +void simdutf_to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) { + simdutf::to_well_formed_utf16be(reinterpret_cast<const char16_t *>(input), + len, reinterpret_cast<char16_t *>(output)); +} +void simdutf_to_well_formed_utf16(const char16_t *input, size_t len, + char16_t *output) { + simdutf::to_well_formed_utf16(reinterpret_cast<const char16_t *>(input), len, + reinterpret_cast<char16_t *>(output)); +} + +size_t simdutf_count_utf16(const char16_t *input, size_t length) { + return simdutf::count_utf16(reinterpret_cast<const char16_t *>(input), + length); +} +size_t simdutf_count_utf16le(const char16_t *input, size_t length)
{ + return simdutf::count_utf16le(reinterpret_cast<const char16_t *>(input), + length); +} +size_t simdutf_count_utf16be(const char16_t *input, size_t length) { + return simdutf::count_utf16be(reinterpret_cast<const char16_t *>(input), + length); +} +size_t simdutf_count_utf8(const char *input, size_t length) { + return simdutf::count_utf8(input, length); +} + +size_t simdutf_utf8_length_from_latin1(const char *input, size_t length) { + return simdutf::utf8_length_from_latin1(input, length); +} +size_t simdutf_latin1_length_from_utf8(const char *input, size_t length) { + return simdutf::latin1_length_from_utf8(input, length); +} +size_t simdutf_latin1_length_from_utf16(size_t length) { + return simdutf::latin1_length_from_utf16(length); +} +size_t simdutf_latin1_length_from_utf32(size_t length) { + return simdutf::latin1_length_from_utf32(length); +} +size_t simdutf_utf16_length_from_utf8(const char *input, size_t length) { + return simdutf::utf16_length_from_utf8(input, length); +} +size_t simdutf_utf32_length_from_utf8(const char *input, size_t length) { + return simdutf::utf32_length_from_utf8(input, length); +} +size_t simdutf_utf8_length_from_utf16(const char16_t *input, size_t length) { + return simdutf::utf8_length_from_utf16( + reinterpret_cast<const char16_t *>(input), length); +} +simdutf_result +simdutf_utf8_length_from_utf16_with_replacement(const char16_t *input, + size_t length) { + return to_c_result(simdutf::utf8_length_from_utf16_with_replacement( + reinterpret_cast<const char16_t *>(input), length)); +} +size_t simdutf_utf8_length_from_utf16le(const char16_t *input, size_t length) { + return simdutf::utf8_length_from_utf16le( + reinterpret_cast<const char16_t *>(input), length); +} +size_t simdutf_utf8_length_from_utf16be(const char16_t *input, size_t length) { + return simdutf::utf8_length_from_utf16be( + reinterpret_cast<const char16_t *>(input), length); +} +simdutf_result +simdutf_utf8_length_from_utf16le_with_replacement(const char16_t *input, + size_t length) { + return to_c_result(simdutf::utf8_length_from_utf16le_with_replacement( + reinterpret_cast<const char16_t *>(input), length)); +} +simdutf_result +simdutf_utf8_length_from_utf16be_with_replacement(const char16_t *input, + size_t length) { + return to_c_result(simdutf::utf8_length_from_utf16be_with_replacement( + reinterpret_cast<const char16_t *>(input), length)); +} +
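/*
 * Sketch of the usual two-step sizing pattern with the length estimators
 * above (illustrative; utf8_to_utf16le is a hypothetical helper and error
 * handling is kept minimal):
 *
 *   #include <stdlib.h>
 *   #include <simdutf_c.h>
 *
 *   char16_t *utf8_to_utf16le(const char *utf8, size_t len, size_t *units) {
 *     // First ask how many UTF-16 code units the output needs...
 *     size_t need = simdutf_utf16_length_from_utf8(utf8, len);
 *     char16_t *out = malloc(need * sizeof(char16_t));
 *     if (!out) return NULL;
 *     // ...then convert into the exactly-sized buffer.
 *     *units = simdutf_convert_utf8_to_utf16le(utf8, len, out);
 *     return out;
 *   }
 */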
+/* Conversions: latin1 <-> utf8, utf8 <-> utf16/utf32, utf16 <-> utf8, etc. */ +size_t simdutf_convert_latin1_to_utf8(const char *input, size_t length, + char *output) { + return simdutf::convert_latin1_to_utf8(input, length, output); +} + +size_t simdutf_convert_latin1_to_utf8_safe(const char *input, size_t length, + char *output, size_t utf8_len) { + return simdutf::convert_latin1_to_utf8_safe(input, length, output, utf8_len); +} +size_t simdutf_convert_latin1_to_utf16le(const char *input, size_t length, + char16_t *output) { + return simdutf::convert_latin1_to_utf16le( + input, length, reinterpret_cast<char16_t *>(output)); +} +size_t simdutf_convert_latin1_to_utf16be(const char *input, size_t length, + char16_t *output) { + return simdutf::convert_latin1_to_utf16be( + input, length, reinterpret_cast<char16_t *>(output)); +} +size_t simdutf_convert_latin1_to_utf32(const char *input, size_t length, + char32_t *output) { + return simdutf::convert_latin1_to_utf32(input, length, + reinterpret_cast<char32_t *>(output)); +} + +size_t simdutf_convert_utf8_to_latin1(const char *input, size_t length, + char *output) { + return simdutf::convert_utf8_to_latin1(input, length, output); +} +size_t simdutf_convert_utf8_to_utf16le(const char *input, size_t length, + char16_t *output) { + return simdutf::convert_utf8_to_utf16le(input, length, + reinterpret_cast<char16_t *>(output)); +} +size_t simdutf_convert_utf8_to_utf16(const char *input, size_t length, + char16_t *output) { + return simdutf::convert_utf8_to_utf16(input, length, + reinterpret_cast<char16_t *>(output)); +} +size_t simdutf_convert_utf8_to_utf16be(const char *input, size_t length, + char16_t *output) { + return simdutf::convert_utf8_to_utf16be(input, length, + reinterpret_cast<char16_t *>(output)); +} +size_t simdutf_convert_utf8_to_utf32(const char *input, size_t length, + char32_t *output) { + return simdutf::convert_utf8_to_utf32(input, length, + reinterpret_cast<char32_t *>(output)); +} +simdutf_result simdutf_convert_utf8_to_latin1_with_errors(const char *input, + size_t length, + char *output) { + return to_c_result( + simdutf::convert_utf8_to_latin1_with_errors(input, length, output)); +} +simdutf_result simdutf_convert_utf8_to_utf16_with_errors(const char *input, + size_t length, + char16_t *output) { + return to_c_result(simdutf::convert_utf8_to_utf16_with_errors( + input, length, reinterpret_cast<char16_t *>(output))); +} +simdutf_result simdutf_convert_utf8_to_utf16le_with_errors(const char *input, + size_t length, + char16_t *output) { + return to_c_result(simdutf::convert_utf8_to_utf16le_with_errors( + input, length, reinterpret_cast<char16_t *>(output))); +} +simdutf_result simdutf_convert_utf8_to_utf16be_with_errors(const char *input, + size_t length, + char16_t *output) { + return to_c_result(simdutf::convert_utf8_to_utf16be_with_errors( + input, length, reinterpret_cast<char16_t *>(output))); +} +simdutf_result simdutf_convert_utf8_to_utf32_with_errors(const char *input, + size_t length, + char32_t *output) { + return to_c_result(simdutf::convert_utf8_to_utf32_with_errors( + input, length, reinterpret_cast<char32_t *>(output))); +} +
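/*
 * Sketch: interpreting a simdutf_result from the *_with_errors variants
 * (illustrative; semantics as documented by simdutf: on success, count is
 * the number of code units written, on failure it is the offset of the
 * error in the input):
 *
 *   simdutf_result r =
 *       simdutf_convert_utf8_to_utf16le_with_errors(src, src_len, dst);
 *   if (r.error == SIMDUTF_ERROR_SUCCESS) {
 *     // r.count UTF-16 code units were written to dst.
 *   } else {
 *     // The input is malformed at byte offset r.count.
 *   }
 */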
+/* Conversions assuming valid input */ +size_t simdutf_convert_valid_utf8_to_latin1(const char *input, size_t length, + char *output) { + return simdutf::convert_valid_utf8_to_latin1(input, length, output); +} +size_t simdutf_convert_valid_utf8_to_utf16le(const char *input, size_t length, + char16_t *output) { + return simdutf::convert_valid_utf8_to_utf16le( + input, length, reinterpret_cast<char16_t *>(output)); +} +size_t simdutf_convert_valid_utf8_to_utf16be(const char *input, size_t length, + char16_t *output) { + return simdutf::convert_valid_utf8_to_utf16be( + input, length, reinterpret_cast<char16_t *>(output)); +} +size_t simdutf_convert_valid_utf8_to_utf32(const char *input, size_t length, + char32_t *output) { + return simdutf::convert_valid_utf8_to_utf32( + input, length, reinterpret_cast<char32_t *>(output)); +} + +/* UTF-16 -> UTF-8 and related conversions */ +size_t simdutf_convert_utf16_to_utf8(const char16_t *input, size_t length, + char *output) { + return simdutf::convert_utf16_to_utf8( + reinterpret_cast<const char16_t *>(input), length, output); +} +size_t simdutf_convert_utf16_to_utf8_safe(const char16_t *input, size_t length, + char *output, size_t utf8_len) { + return simdutf::convert_utf16_to_utf8_safe( + reinterpret_cast<const char16_t *>(input), length, output, utf8_len); +} +size_t simdutf_convert_utf16_to_latin1(const char16_t *input, size_t length, + char *output) { + return simdutf::convert_utf16_to_latin1( + reinterpret_cast<const char16_t *>(input), length, output); +} +size_t simdutf_convert_utf16le_to_latin1(const char16_t *input, size_t length, + char *output) { + return simdutf::convert_utf16le_to_latin1( + reinterpret_cast<const char16_t *>(input), length, output); +} +size_t simdutf_convert_utf16be_to_latin1(const char16_t *input, size_t length, + char *output) { + return simdutf::convert_utf16be_to_latin1( + reinterpret_cast<const char16_t *>(input), length, output); +} +simdutf_result +simdutf_convert_utf16_to_latin1_with_errors(const char16_t *input, + size_t length, char *output) { + return to_c_result(simdutf::convert_utf16_to_latin1_with_errors( + reinterpret_cast<const char16_t *>(input), length, output)); +} +simdutf_result +simdutf_convert_utf16le_to_latin1_with_errors(const char16_t *input, + size_t length, char *output) { + return to_c_result(simdutf::convert_utf16le_to_latin1_with_errors( + reinterpret_cast<const char16_t *>(input), length, output)); +} +simdutf_result +simdutf_convert_utf16be_to_latin1_with_errors(const char16_t *input, + size_t length, char *output) { + return to_c_result(simdutf::convert_utf16be_to_latin1_with_errors( + reinterpret_cast<const char16_t *>(input), length, output)); +} + +simdutf_result simdutf_convert_utf16_to_utf8_with_errors(const char16_t *input, + size_t length, + char *output) { + return to_c_result(simdutf::convert_utf16_to_utf8_with_errors( + reinterpret_cast<const char16_t *>(input), length, output)); +} +simdutf_result +simdutf_convert_utf16le_to_utf8_with_errors(const char16_t *input, + size_t length, char *output) { + return to_c_result(simdutf::convert_utf16le_to_utf8_with_errors( + reinterpret_cast<const char16_t *>(input), length, output)); +} +simdutf_result +simdutf_convert_utf16be_to_utf8_with_errors(const char16_t *input, + size_t length, char *output) { + return to_c_result(simdutf::convert_utf16be_to_utf8_with_errors( + reinterpret_cast<const char16_t *>(input), length, output)); +} + +size_t simdutf_convert_utf16le_to_utf8(const char16_t *input, size_t length, + char *output) { + return simdutf::convert_utf16le_to_utf8( + reinterpret_cast<const char16_t *>(input), length, output); +} +size_t simdutf_convert_utf16be_to_utf8(const char16_t *input, size_t length, + char *output) { + return simdutf::convert_utf16be_to_utf8( + reinterpret_cast<const char16_t *>(input), length, output); +} +
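/*
 * Sketch: sanitizing UTF-16 before conversion (illustrative). Per the
 * simdutf documentation, the to_well_formed helpers replace lone
 * surrogates with U+FFFD, so the output can safely take the
 * valid-input fast path:
 *
 *   simdutf_to_well_formed_utf16(src, units, tmp); // tmp: units code units
 *   size_t need = simdutf_utf8_length_from_utf16(tmp, units);
 *   // ... allocate out with at least `need` bytes ...
 *   size_t written = simdutf_convert_valid_utf16_to_utf8(tmp, units, out);
 */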
+size_t simdutf_convert_valid_utf16_to_utf8(const char16_t *input, size_t length, + char *output) { + return simdutf::convert_valid_utf16_to_utf8( + reinterpret_cast<const char16_t *>(input), length, output); +} +size_t simdutf_convert_valid_utf16_to_latin1(const char16_t *input, + size_t length, char *output) { + return simdutf::convert_valid_utf16_to_latin1( + reinterpret_cast<const char16_t *>(input), length, output); +} +size_t simdutf_convert_valid_utf16le_to_latin1(const char16_t *input, + size_t length, char *output) { + return simdutf::convert_valid_utf16le_to_latin1( + reinterpret_cast<const char16_t *>(input), length, output); +} +size_t simdutf_convert_valid_utf16be_to_latin1(const char16_t *input, + size_t length, char *output) { + return simdutf::convert_valid_utf16be_to_latin1( + reinterpret_cast<const char16_t *>(input), length, output); +} + +size_t simdutf_convert_valid_utf16le_to_utf8(const char16_t *input, + size_t length, char *output) { + return simdutf::convert_valid_utf16le_to_utf8( + reinterpret_cast<const char16_t *>(input), length, output); +} +size_t simdutf_convert_valid_utf16be_to_utf8(const char16_t *input, + size_t length, char *output) { + return simdutf::convert_valid_utf16be_to_utf8( + reinterpret_cast<const char16_t *>(input), length, output); +} + +/* UTF-16 <-> UTF-32 conversions */ +size_t simdutf_convert_utf16_to_utf32(const char16_t *input, size_t length, + char32_t *output) { + return simdutf::convert_utf16_to_utf32( + reinterpret_cast<const char16_t *>(input), length, + reinterpret_cast<char32_t *>(output)); +} +size_t simdutf_convert_utf16le_to_utf32(const char16_t *input, size_t length, + char32_t *output) { + return simdutf::convert_utf16le_to_utf32( + reinterpret_cast<const char16_t *>(input), length, + reinterpret_cast<char32_t *>(output)); +} +size_t simdutf_convert_utf16be_to_utf32(const char16_t *input, size_t length, + char32_t *output) { + return simdutf::convert_utf16be_to_utf32( + reinterpret_cast<const char16_t *>(input), length, + reinterpret_cast<char32_t *>(output)); +} +simdutf_result simdutf_convert_utf16_to_utf32_with_errors(const char16_t *input, + size_t length, + char32_t *output) { + return to_c_result(simdutf::convert_utf16_to_utf32_with_errors( + reinterpret_cast<const char16_t *>(input), length, + reinterpret_cast<char32_t *>(output))); +} +simdutf_result +simdutf_convert_utf16le_to_utf32_with_errors(const char16_t *input, + size_t length, char32_t *output) { + return to_c_result(simdutf::convert_utf16le_to_utf32_with_errors( + reinterpret_cast<const char16_t *>(input), length, + reinterpret_cast<char32_t *>(output))); +} +simdutf_result +simdutf_convert_utf16be_to_utf32_with_errors(const char16_t *input, + size_t length, char32_t *output) { + return to_c_result(simdutf::convert_utf16be_to_utf32_with_errors( + reinterpret_cast<const char16_t *>(input), length, + reinterpret_cast<char32_t *>(output))); +} + +/* Valid UTF-16 conversions */ +size_t simdutf_convert_valid_utf16_to_utf32(const char16_t *input, + size_t length, char32_t *output) { + return simdutf::convert_valid_utf16_to_utf32( + reinterpret_cast<const char16_t *>(input), length, + reinterpret_cast<char32_t *>(output)); +} +size_t simdutf_convert_valid_utf16le_to_utf32(const char16_t *input, + size_t length, char32_t *output) { + return simdutf::convert_valid_utf16le_to_utf32( + reinterpret_cast<const char16_t *>(input), length, + reinterpret_cast<char32_t *>(output)); +} +size_t simdutf_convert_valid_utf16be_to_utf32(const char16_t *input, + size_t length, char32_t *output) { + return simdutf::convert_valid_utf16be_to_utf32( + reinterpret_cast<const char16_t *>(input), length, + reinterpret_cast<char32_t *>(output)); +} + +/* UTF-32 -> ...
conversions */ +size_t simdutf_convert_utf32_to_utf8(const char32_t *input, size_t length, + char *output) { + return simdutf::convert_utf32_to_utf8( + reinterpret_cast<const char32_t *>(input), length, output); +} +simdutf_result simdutf_convert_utf32_to_utf8_with_errors(const char32_t *input, + size_t length, + char *output) { + return to_c_result(simdutf::convert_utf32_to_utf8_with_errors( + reinterpret_cast<const char32_t *>(input), length, output)); +} +size_t simdutf_convert_valid_utf32_to_utf8(const char32_t *input, size_t length, + char *output) { + return simdutf::convert_valid_utf32_to_utf8( + reinterpret_cast<const char32_t *>(input), length, output); +} + +size_t simdutf_convert_utf32_to_utf16(const char32_t *input, size_t length, + char16_t *output) { + return simdutf::convert_utf32_to_utf16( + reinterpret_cast<const char32_t *>(input), length, + reinterpret_cast<char16_t *>(output)); +} +size_t simdutf_convert_utf32_to_utf16le(const char32_t *input, size_t length, + char16_t *output) { + return simdutf::convert_utf32_to_utf16le( + reinterpret_cast<const char32_t *>(input), length, + reinterpret_cast<char16_t *>(output)); +} +size_t simdutf_convert_utf32_to_utf16be(const char32_t *input, size_t length, + char16_t *output) { + return simdutf::convert_utf32_to_utf16be( + reinterpret_cast<const char32_t *>(input), length, + reinterpret_cast<char16_t *>(output)); +} +simdutf_result +simdutf_convert_utf32_to_latin1_with_errors(const char32_t *input, + size_t length, char *output) { + return to_c_result(simdutf::convert_utf32_to_latin1_with_errors( + reinterpret_cast<const char32_t *>(input), length, output)); +} + +/* --- find helpers --- */ +const char *simdutf_find(const char *start, const char *end, char character) { + return simdutf::find(start, end, character); +} +const char16_t *simdutf_find_utf16(const char16_t *start, const char16_t *end, + char16_t character) { + return simdutf::find(start, end, character); +} + +/* --- base64 helpers --- */ +size_t simdutf_maximal_binary_length_from_base64(const char *input, + size_t length) { + return simdutf::maximal_binary_length_from_base64(input, length); +} +size_t simdutf_maximal_binary_length_from_base64_utf16(const char16_t *input, + size_t length) { + return simdutf::maximal_binary_length_from_base64(input, length); +} + +simdutf_result simdutf_base64_to_binary( + const char *input, size_t length, char *output, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options) { + return to_c_result(simdutf::base64_to_binary( + input, length, output, static_cast<simdutf::base64_options>(options), + static_cast<simdutf::last_chunk_handling_options>(last_chunk_options))); +} +simdutf_result simdutf_base64_to_binary_utf16( + const char16_t *input, size_t length, char *output, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options) { + return to_c_result(simdutf::base64_to_binary( + input, length, output, static_cast<simdutf::base64_options>(options), + static_cast<simdutf::last_chunk_handling_options>(last_chunk_options))); +} + +size_t simdutf_base64_length_from_binary(size_t length, + simdutf_base64_options options) { + return simdutf::base64_length_from_binary( + length, static_cast<simdutf::base64_options>(options)); +} +size_t simdutf_base64_length_from_binary_with_lines( + size_t length, simdutf_base64_options options, size_t line_length) { + return simdutf::base64_length_from_binary_with_lines( + length, static_cast<simdutf::base64_options>(options), line_length); +} + +size_t simdutf_binary_to_base64(const char *input, size_t length, char *output, + simdutf_base64_options options) { + return simdutf::binary_to_base64( + input, length, output, static_cast<simdutf::base64_options>(options)); +} +
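/*
 * Sketch: a base64 round trip with the helpers above (illustrative; error
 * handling trimmed). base64_length_from_binary sizes the encoded buffer
 * exactly, and the _safe decoder reports how much it wrote through the
 * in/out outlen parameter:
 *
 *   size_t b64_len = simdutf_base64_length_from_binary(n, SIMDUTF_BASE64_DEFAULT);
 *   char *b64 = malloc(b64_len);
 *   simdutf_binary_to_base64(data, n, b64, SIMDUTF_BASE64_DEFAULT);
 *
 *   size_t outlen = simdutf_maximal_binary_length_from_base64(b64, b64_len);
 *   char *raw = malloc(outlen);
 *   simdutf_result r = simdutf_base64_to_binary_safe(
 *       b64, b64_len, raw, &outlen, SIMDUTF_BASE64_DEFAULT,
 *       SIMDUTF_LAST_CHUNK_LOOSE, false);
 *   // On success, outlen now holds the decoded byte count.
 */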
+size_t simdutf_binary_to_base64_with_lines(const char *input, size_t length, + char *output, size_t line_length, + simdutf_base64_options options) { + return simdutf::binary_to_base64_with_lines( + input, length, output, line_length, + static_cast<simdutf::base64_options>(options)); +} + +simdutf_result simdutf_base64_to_binary_safe( + const char *input, size_t length, char *output, size_t *outlen, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options, + bool decode_up_to_bad_char) { + size_t local_out = outlen ? *outlen : 0; + simdutf::result r = simdutf::base64_to_binary_safe( + input, length, output, local_out, + static_cast<simdutf::base64_options>(options), + static_cast<simdutf::last_chunk_handling_options>(last_chunk_options), + decode_up_to_bad_char); + if (outlen) + *outlen = local_out; + return to_c_result(r); +} +simdutf_result simdutf_base64_to_binary_safe_utf16( + const char16_t *input, size_t length, char *output, size_t *outlen, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options, + bool decode_up_to_bad_char) { + size_t local_out = outlen ? *outlen : 0; + simdutf::result r = simdutf::base64_to_binary_safe( + input, length, output, local_out, + static_cast<simdutf::base64_options>(options), + static_cast<simdutf::last_chunk_handling_options>(last_chunk_options), + decode_up_to_bad_char); + if (outlen) + *outlen = local_out; + return to_c_result(r); +} + +} // extern "C" +// clang-format off +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_ASCII && SIMDUTF_FEATURE_BASE64 && SIMDUTF_FEATURE_DETECT_ENCODING +// clang-format on +/* end file src/simdutf_c.cpp */ SIMDUTF_POP_DISABLE_WARNINGS /* end file src/simdutf.cpp */ diff --git a/simdutf/simdutf.h b/simdutf/simdutf.h index 4fb7a87a..79081a1b 100644 --- a/simdutf/simdutf.h +++ b/simdutf/simdutf.h @@ -1,4 +1,4 @@ -/* auto-generated on 2024-05-07 22:33:11 -0400. Do not edit! */ +/* auto-generated on 2026-01-13 09:03:21 +0100. Do not edit! */ /* begin file include/simdutf.h */ #ifndef SIMDUTF_H #define SIMDUTF_H @@ -9,34 +9,49 @@ #define SIMDUTF_COMPILER_CHECK_H #ifndef __cplusplus -#error simdutf requires a C++ compiler + #error simdutf requires a C++ compiler #endif #ifndef SIMDUTF_CPLUSPLUS -#if defined(_MSVC_LANG) && !defined(__clang__) -#define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG) -#else -#define SIMDUTF_CPLUSPLUS __cplusplus + #if defined(_MSVC_LANG) && !defined(__clang__) + #define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ?
201103L : _MSVC_LANG) + #else + #define SIMDUTF_CPLUSPLUS __cplusplus + #endif +#endif + +// C++ 26 +#if !defined(SIMDUTF_CPLUSPLUS26) && (SIMDUTF_CPLUSPLUS >= 202602L) + #define SIMDUTF_CPLUSPLUS26 1 +#endif + +// C++ 23 +#if !defined(SIMDUTF_CPLUSPLUS23) && (SIMDUTF_CPLUSPLUS >= 202302L) + #define SIMDUTF_CPLUSPLUS23 1 #endif + +// C++ 20 +#if !defined(SIMDUTF_CPLUSPLUS20) && (SIMDUTF_CPLUSPLUS >= 202002L) + #define SIMDUTF_CPLUSPLUS20 1 #endif // C++ 17 #if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L) -#define SIMDUTF_CPLUSPLUS17 1 + #define SIMDUTF_CPLUSPLUS17 1 #endif // C++ 14 #if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L) -#define SIMDUTF_CPLUSPLUS14 1 + #define SIMDUTF_CPLUSPLUS14 1 #endif // C++ 11 #if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L) -#define SIMDUTF_CPLUSPLUS11 1 + #define SIMDUTF_CPLUSPLUS11 1 #endif #ifndef SIMDUTF_CPLUSPLUS11 -#error simdutf requires a compiler compliant with the C++11 standard + #error simdutf requires a compiler compliant with the C++11 standard #endif #endif // SIMDUTF_COMPILER_CHECK_H @@ -45,19 +60,39 @@ #ifndef SIMDUTF_COMMON_DEFS_H #define SIMDUTF_COMMON_DEFS_H -#include /* begin file include/simdutf/portability.h */ #ifndef SIMDUTF_PORTABILITY_H #define SIMDUTF_PORTABILITY_H + +#include #include #include #include -#include -#include #ifndef _WIN32 -// strcasecmp, strncasecmp -#include + // strcasecmp, strncasecmp + #include +#endif + +#if defined(__apple_build_version__) + #if __apple_build_version__ < 14000000 + #define SIMDUTF_SPAN_DISABLED \ + 1 // apple-clang/13 doesn't support std::convertible_to + #endif +#endif + +#if SIMDUTF_CPLUSPLUS20 + #include + #if __cpp_concepts >= 201907L && __cpp_lib_span >= 202002L && \ + !defined(SIMDUTF_SPAN_DISABLED) + #define SIMDUTF_SPAN 1 + #endif // __cpp_concepts >= 201907L && __cpp_lib_span >= 202002L + #if __cpp_lib_atomic_ref >= 201806L + #define SIMDUTF_ATOMIC_REF 1 + #endif // __cpp_lib_atomic_ref + #if __has_cpp_attribute(maybe_unused) >= 201603L + #define SIMDUTF_MAYBE_UNUSED_AVAILABLE 1 + #endif // __has_cpp_attribute(maybe_unused) >= 201603L #endif /** @@ -66,131 +101,139 @@ */ #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) -#define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + #define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #elif defined(_WIN32) -#define SIMDUTF_IS_BIG_ENDIAN 0 + #define SIMDUTF_IS_BIG_ENDIAN 0 #else -#if defined(__APPLE__) || defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ -#include -#elif defined(sun) || defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__) -#include -#else // defined(__APPLE__) || defined(__FreeBSD__) - -#ifdef __has_include -#if __has_include() -#include -#endif //__has_include() -#endif //__has_include - -#endif // defined(__APPLE__) || defined(__FreeBSD__) - - -#ifndef !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) -#define SIMDUTF_IS_BIG_ENDIAN 0 -#endif + #if defined(__APPLE__) || \ + defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined + // __ORDER_BIG_ENDIAN__ + #include + #elif defined(sun) || \ + defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__) + #include + #else // defined(__APPLE__) || defined(__FreeBSD__) + + #ifdef __has_include + #if __has_include() + #include + #endif //__has_include() + #endif //__has_include + + #endif // defined(__APPLE__) || defined(__FreeBSD__) + + #ifndef !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) + #define 
SIMDUTF_IS_BIG_ENDIAN 0 + #endif -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define SIMDUTF_IS_BIG_ENDIAN 0 -#else // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define SIMDUTF_IS_BIG_ENDIAN 1 -#endif // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #define SIMDUTF_IS_BIG_ENDIAN 0 + #else // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #define SIMDUTF_IS_BIG_ENDIAN 1 + #endif // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ - /** * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined. */ #ifdef _MSC_VER -#define SIMDUTF_VISUAL_STUDIO 1 -/** - * We want to differentiate carefully between - * clang under visual studio and regular visual - * studio. - * - * Under clang for Windows, we enable: - * * target pragmas so that part and only part of the - * code gets compiled for advanced instructions. - * - */ -#ifdef __clang__ -// clang under visual studio -#define SIMDUTF_CLANG_VISUAL_STUDIO 1 -#else -// just regular visual studio (best guess) -#define SIMDUTF_REGULAR_VISUAL_STUDIO 1 -#endif // __clang__ -#endif // _MSC_VER + #define SIMDUTF_VISUAL_STUDIO 1 + /** + * We want to differentiate carefully between + * clang under visual studio and regular visual + * studio. + * + * Under clang for Windows, we enable: + * * target pragmas so that part and only part of the + * code gets compiled for advanced instructions. + * + */ + #ifdef __clang__ + // clang under visual studio + #define SIMDUTF_CLANG_VISUAL_STUDIO 1 + #else + // just regular visual studio (best guess) + #define SIMDUTF_REGULAR_VISUAL_STUDIO 1 + #endif // __clang__ +#endif // _MSC_VER #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO -// https://en.wikipedia.org/wiki/C_alternative_tokens -// This header should have no effect, except maybe -// under Visual Studio. -#include + // https://en.wikipedia.org/wiki/C_alternative_tokens + // This header should have no effect, except maybe + // under Visual Studio. + #include #endif #if (defined(__x86_64__) || defined(_M_AMD64)) && !defined(_M_ARM64EC) -#define SIMDUTF_IS_X86_64 1 + #define SIMDUTF_IS_X86_64 1 #elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) -#define SIMDUTF_IS_ARM64 1 + #define SIMDUTF_IS_ARM64 1 #elif defined(__PPC64__) || defined(_M_PPC64) -//#define SIMDUTF_IS_PPC64 1 -// The simdutf library does yet support SIMD acceleration under -// POWER processors. Please see https://github.com/lemire/simdutf/issues/51 + #if defined(__VEC__) && defined(__ALTIVEC__) + #define SIMDUTF_IS_PPC64 1 + #endif #elif defined(__s390__) // s390 IBM system. Big endian. #elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64 -// RISC-V 64-bit -#define SIMDUTF_IS_RISCV64 1 - -#if __clang_major__ >= 19 -// Does the compiler support target regions for RISC-V -#define SIMDUTF_HAS_RVV_TARGET_REGION 1 -#endif - -#if __riscv_v_intrinsic >= 11000 -#define SIMDUTF_HAS_RVV_INTRINSICS 1 -#endif + // RISC-V 64-bit + #define SIMDUTF_IS_RISCV64 1 + + // #if __riscv_v_intrinsic >= 1000000 + // #define SIMDUTF_HAS_RVV_INTRINSICS 1 + // #define SIMDUTF_HAS_RVV_TARGET_REGION 1 + // #elif ... 
+ // Check for special compiler versions that implement pre v1.0 intrinsics + #if __riscv_v_intrinsic >= 11000 + #define SIMDUTF_HAS_RVV_INTRINSICS 1 + #endif -#define SIMDUTF_HAS_ZVBB_INTRINSICS 0 // there is currently no way to detect this + #define SIMDUTF_HAS_ZVBB_INTRINSICS \ + 0 // there is currently no way to detect this -#if SIMDUTF_HAS_RVV_INTRINSICS && __riscv_vector && __riscv_v_min_vlen >= 128 && __riscv_v_elen >= 64 -// RISC-V V extension -#define SIMDUTF_IS_RVV 1 -#if SIMDUTF_HAS_ZVBB_INTRINSICS && __riscv_zvbb >= 1000000 -// RISC-V Vector Basic Bit-manipulation -#define SIMDUTF_IS_ZVBB 1 -#endif -#endif + #if SIMDUTF_HAS_RVV_INTRINSICS && __riscv_vector && \ + __riscv_v_min_vlen >= 128 && __riscv_v_elen >= 64 + // RISC-V V extension + #define SIMDUTF_IS_RVV 1 + #if SIMDUTF_HAS_ZVBB_INTRINSICS && __riscv_zvbb >= 1000000 + // RISC-V Vector Basic Bit-manipulation + #define SIMDUTF_IS_ZVBB 1 + #endif + #endif #elif defined(__loongarch_lp64) -// LoongArch 64-bit + #if defined(__loongarch_sx) && defined(__loongarch_asx) + #define SIMDUTF_IS_LSX 1 + #define SIMDUTF_IS_LASX 1 // We can always run both + #elif defined(__loongarch_sx) + #define SIMDUTF_IS_LSX 1 + #endif #else -// The simdutf library is designed -// for 64-bit processors and it seems that you are not -// compiling for a known 64-bit platform. Please -// use a 64-bit target such as x64 or 64-bit ARM for best performance. -#define SIMDUTF_IS_32BITS 1 - -// We do not support 32-bit platforms, but it can be -// handy to identify them. -#if defined(_M_IX86) || defined(__i386__) -#define SIMDUTF_IS_X86_32BITS 1 -#elif defined(__arm__) || defined(_M_ARM) -#define SIMDUTF_IS_ARM_32BITS 1 -#elif defined(__PPC__) || defined(_M_PPC) -#define SIMDUTF_IS_PPC_32BITS 1 -#endif + // The simdutf library is designed + // for 64-bit processors and it seems that you are not + // compiling for a known 64-bit platform. Please + // use a 64-bit target such as x64 or 64-bit ARM for best performance. + #define SIMDUTF_IS_32BITS 1 + + // We do not support 32-bit platforms, but it can be + // handy to identify them. + #if defined(_M_IX86) || defined(__i386__) + #define SIMDUTF_IS_X86_32BITS 1 + #elif defined(__arm__) || defined(_M_ARM) + #define SIMDUTF_IS_ARM_32BITS 1 + #elif defined(__PPC__) || defined(_M_PPC) + #define SIMDUTF_IS_PPC_32BITS 1 + #endif #endif // defined(__x86_64__) || defined(_M_AMD64) #ifdef SIMDUTF_IS_32BITS -#ifndef SIMDUTF_NO_PORTABILITY_WARNING -// In the future, we may want to warn users of 32-bit systems that -// the simdutf does not support accelerated kernels for such systems. -#endif // SIMDUTF_NO_PORTABILITY_WARNING -#endif // SIMDUTF_IS_32BITS + #ifndef SIMDUTF_NO_PORTABILITY_WARNING + // In the future, we may want to warn users of 32-bit systems that + // the simdutf does not support accelerated kernels for such systems. + #endif // SIMDUTF_NO_PORTABILITY_WARNING +#endif // SIMDUTF_IS_32BITS // this is almost standard? #define SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) #a @@ -207,93 +250,83 @@ // slower, but it should run everywhere. // -// Enable valid runtime implementations, and select SIMDUTF_BUILTIN_IMPLEMENTATION +// Enable valid runtime implementations, and select +// SIMDUTF_BUILTIN_IMPLEMENTATION // // We are going to use runtime dispatch. -#ifdef SIMDUTF_IS_X86_64 -#ifdef __clang__ -// clang does not have GCC push pop -// warning: clang attribute push can't be used within a namespace in clang up -// til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be *outside* of a -// namespace. 
-#define SIMDUTF_TARGET_REGION(T) \ - _Pragma(SIMDUTF_STRINGIFY( \ - clang attribute push(__attribute__((target(T))), apply_to = function))) -#define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop") -#elif defined(__GNUC__) -// GCC is easier -#define SIMDUTF_TARGET_REGION(T) \ - _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T))) -#define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options") -#endif // clang then gcc - -#endif // x86 +#if defined(SIMDUTF_IS_X86_64) || defined(SIMDUTF_IS_LSX) + #ifdef __clang__ + // clang does not have GCC push pop + // warning: clang attribute push can't be used within a namespace in clang + // up til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be + // *outside* of a namespace. + #define SIMDUTF_TARGET_REGION(T) \ + _Pragma(SIMDUTF_STRINGIFY(clang attribute push( \ + __attribute__((target(T))), apply_to = function))) + #define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop") + #elif defined(__GNUC__) + // GCC is easier + #define SIMDUTF_TARGET_REGION(T) \ + _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T))) + #define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options") + #endif // clang then gcc + +#endif // defined(SIMDUTF_IS_X86_64) || defined(SIMDUTF_IS_LSX) // Default target region macros don't do anything. #ifndef SIMDUTF_TARGET_REGION -#define SIMDUTF_TARGET_REGION(T) -#define SIMDUTF_UNTARGET_REGION + #define SIMDUTF_TARGET_REGION(T) + #define SIMDUTF_UNTARGET_REGION #endif // Is threading enabled? #if defined(_REENTRANT) || defined(_MT) -#ifndef SIMDUTF_THREADS_ENABLED -#define SIMDUTF_THREADS_ENABLED -#endif + #ifndef SIMDUTF_THREADS_ENABLED + #define SIMDUTF_THREADS_ENABLED + #endif #endif // workaround for large stack sizes under -O0. // https://github.com/simdutf/simdutf/issues/691 #ifdef __APPLE__ -#ifndef __OPTIMIZE__ -// Apple systems have small stack sizes in secondary threads. -// Lack of compiler optimization may generate high stack usage. -// Users may want to disable threads for safety, but only when -// in debug mode which we detect by the fact that the __OPTIMIZE__ -// macro is not defined. -#undef SIMDUTF_THREADS_ENABLED -#endif -#endif - -#ifdef SIMDUTF_VISUAL_STUDIO -// This is one case where we do not distinguish between -// regular visual studio and clang under visual studio. -// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has) -#define simdutf_strcasecmp _stricmp -#define simdutf_strncasecmp _strnicmp -#else -// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8). -// So they are only useful for ASCII in our context. -// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings -#define simdutf_strcasecmp strcasecmp -#define simdutf_strncasecmp strncasecmp + #ifndef __OPTIMIZE__ + // Apple systems have small stack sizes in secondary threads. + // Lack of compiler optimization may generate high stack usage. + // Users may want to disable threads for safety, but only when + // in debug mode which we detect by the fact that the __OPTIMIZE__ + // macro is not defined. + #undef SIMDUTF_THREADS_ENABLED + #endif #endif -#ifdef NDEBUG - #ifdef SIMDUTF_VISUAL_STUDIO -#define SIMDUTF_UNREACHABLE() __assume(0) -#define SIMDUTF_ASSUME(COND) __assume(COND) + // This is one case where we do not distinguish between + // regular visual studio and clang under visual studio. 
+ // clang under Windows has _stricmp (like visual studio) but not strcasecmp + // (as clang normally has) + #define simdutf_strcasecmp _stricmp + #define simdutf_strncasecmp _strnicmp #else -#define SIMDUTF_UNREACHABLE() __builtin_unreachable(); -#define SIMDUTF_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0) -#endif - -#else // NDEBUG - -#define SIMDUTF_UNREACHABLE() assert(0); -#define SIMDUTF_ASSUME(COND) assert(COND) - + // The strcasecmp, strncasecmp, and strcasestr functions do not work with + // multibyte strings (e.g. UTF-8). So they are only useful for ASCII in our + // context. + // https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings + #define simdutf_strcasecmp strcasecmp + #define simdutf_strncasecmp strncasecmp #endif - #if defined(__GNUC__) && !defined(__clang__) -#if __GNUC__ >= 11 -#define SIMDUTF_GCC11ORMORE 1 -#endif // __GNUC__ >= 11 -#endif // defined(__GNUC__) && !defined(__clang__) - + #if __GNUC__ >= 11 + #define SIMDUTF_GCC11ORMORE 1 + #endif // __GNUC__ >= 11 + #if __GNUC__ == 10 + #define SIMDUTF_GCC10 1 + #endif // __GNUC__ == 10 + #if __GNUC__ < 10 + #define SIMDUTF_GCC9OROLDER 1 + #endif // __GNUC__ == 10 +#endif // defined(__GNUC__) && !defined(__clang__) #endif // SIMDUTF_PORTABILITY_H /* end file include/simdutf/portability.h */ @@ -311,218 +344,300 @@ */ #ifndef SIMDUTF_HAS_AVX512F -# if defined(__AVX512F__) && __AVX512F__ == 1 -# define SIMDUTF_HAS_AVX512F 1 -# endif + #if defined(__AVX512F__) && __AVX512F__ == 1 + #define SIMDUTF_HAS_AVX512F 1 + #endif #endif #ifndef SIMDUTF_HAS_AVX512DQ -# if defined(__AVX512DQ__) && __AVX512DQ__ == 1 -# define SIMDUTF_HAS_AVX512DQ 1 -# endif + #if defined(__AVX512DQ__) && __AVX512DQ__ == 1 + #define SIMDUTF_HAS_AVX512DQ 1 + #endif #endif #ifndef SIMDUTF_HAS_AVX512IFMA -# if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1 -# define SIMDUTF_HAS_AVX512IFMA 1 -# endif + #if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1 + #define SIMDUTF_HAS_AVX512IFMA 1 + #endif #endif #ifndef SIMDUTF_HAS_AVX512CD -# if defined(__AVX512CD__) && __AVX512CD__ == 1 -# define SIMDUTF_HAS_AVX512CD 1 -# endif + #if defined(__AVX512CD__) && __AVX512CD__ == 1 + #define SIMDUTF_HAS_AVX512CD 1 + #endif #endif #ifndef SIMDUTF_HAS_AVX512BW -# if defined(__AVX512BW__) && __AVX512BW__ == 1 -# define SIMDUTF_HAS_AVX512BW 1 -# endif + #if defined(__AVX512BW__) && __AVX512BW__ == 1 + #define SIMDUTF_HAS_AVX512BW 1 + #endif #endif #ifndef SIMDUTF_HAS_AVX512VL -# if defined(__AVX512VL__) && __AVX512VL__ == 1 -# define SIMDUTF_HAS_AVX512VL 1 -# endif + #if defined(__AVX512VL__) && __AVX512VL__ == 1 + #define SIMDUTF_HAS_AVX512VL 1 + #endif #endif #ifndef SIMDUTF_HAS_AVX512VBMI -# if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1 -# define SIMDUTF_HAS_AVX512VBMI 1 -# endif + #if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1 + #define SIMDUTF_HAS_AVX512VBMI 1 + #endif #endif #ifndef SIMDUTF_HAS_AVX512VBMI2 -# if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1 -# define SIMDUTF_HAS_AVX512VBMI2 1 -# endif + #if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1 + #define SIMDUTF_HAS_AVX512VBMI2 1 + #endif #endif #ifndef SIMDUTF_HAS_AVX512VNNI -# if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1 -# define SIMDUTF_HAS_AVX512VNNI 1 -# endif + #if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1 + #define SIMDUTF_HAS_AVX512VNNI 1 + #endif #endif #ifndef SIMDUTF_HAS_AVX512BITALG -# if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1 -# define SIMDUTF_HAS_AVX512BITALG 1 -# endif + #if defined(__AVX512BITALG__) && 
__AVX512BITALG__ == 1 + #define SIMDUTF_HAS_AVX512BITALG 1 + #endif #endif #ifndef SIMDUTF_HAS_AVX512VPOPCNTDQ -# if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1 -# define SIMDUTF_HAS_AVX512VPOPCNTDQ 1 -# endif + #if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1 + #define SIMDUTF_HAS_AVX512VPOPCNTDQ 1 + #endif #endif #endif // SIMDUTF_AVX512_H_ /* end file include/simdutf/avx512.h */ - -#if defined(__GNUC__) - // Marks a block with a name so that MCA analysis can see it. - #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name); - #define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name); - #define SIMDUTF_DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name); +// Sometimes logging is useful, but we want it disabled by default +// and free of any logging code in release builds. +#ifdef SIMDUTF_LOGGING + #include + #define simdutf_log(msg) \ + std::cout << "[" << __FUNCTION__ << "]: " << msg << std::endl \ + << "\t" << __FILE__ << ":" << __LINE__ << std::endl; + #define simdutf_log_assert(cond, msg) \ + do { \ + if (!(cond)) { \ + std::cerr << "[" << __FUNCTION__ << "]: " << msg << std::endl \ + << "\t" << __FILE__ << ":" << __LINE__ << std::endl; \ + std::abort(); \ + } \ + } while (0) #else - #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) - #define SIMDUTF_END_DEBUG_BLOCK(name) - #define SIMDUTF_DEBUG_BLOCK(name, block) + #define simdutf_log(msg) + #define simdutf_log_assert(cond, msg) #endif -// Align to N-byte boundary -#define SIMDUTF_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) -#define SIMDUTF_ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) - -#define SIMDUTF_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0) - #if defined(SIMDUTF_REGULAR_VISUAL_STUDIO) + #define SIMDUTF_DEPRECATED __declspec(deprecated) - #define simdutf_really_inline __forceinline + #define simdutf_really_inline __forceinline // really inline in release mode + #define simdutf_always_inline __forceinline // always inline, no matter what #define simdutf_never_inline __declspec(noinline) #define simdutf_unused #define simdutf_warn_unused #ifndef simdutf_likely - #define simdutf_likely(x) x + #define simdutf_likely(x) x #endif #ifndef simdutf_unlikely - #define simdutf_unlikely(x) x + #define simdutf_unlikely(x) x #endif - #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning( push )) - #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 )) - #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER )) + #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning(push)) + #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning(push, 0)) + #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) \ + __pragma(warning(disable : WARNING_NUMBER)) // Get rid of Intellisense-only warnings (Code Analysis) - // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910). + // Though __has_include is C++17, it is supported in Visual Studio 2017 or + // better (_MSC_VER>=1910). 
#ifdef __has_include - #if __has_include() - #include - #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS) - #endif + #if __has_include() + #include + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS \ + SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS) + #endif #endif #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS - #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS #endif #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996) #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING - #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop )) - + #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning(pop)) + #define SIMDUTF_DISABLE_UNUSED_WARNING #else // SIMDUTF_REGULAR_VISUAL_STUDIO - - #define simdutf_really_inline inline __attribute__((always_inline)) + #if defined(__OPTIMIZE__) || defined(NDEBUG) + #define simdutf_really_inline inline __attribute__((always_inline)) + #else + #define simdutf_really_inline inline + #endif + #define simdutf_always_inline \ + inline __attribute__((always_inline)) // always inline, no matter what + #define SIMDUTF_DEPRECATED __attribute__((deprecated)) #define simdutf_never_inline inline __attribute__((noinline)) #define simdutf_unused __attribute__((unused)) #define simdutf_warn_unused __attribute__((warn_unused_result)) #ifndef simdutf_likely - #define simdutf_likely(x) __builtin_expect(!!(x), 1) + #define simdutf_likely(x) __builtin_expect(!!(x), 1) #endif #ifndef simdutf_unlikely - #define simdutf_unlikely(x) __builtin_expect(!!(x), 0) + #define simdutf_unlikely(x) __builtin_expect(!!(x), 0) #endif - + // clang-format off #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push") - // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary - #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS SIMDUTF_PUSH_DISABLE_WARNINGS \ - SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \ - SIMDUTF_DISABLE_GCC_WARNING(-Wall) \ - SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \ - SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \ - SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \ - SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \ - SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \ - SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \ - SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \ - SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \ + // gcc doesn't seem to disable all warnings with all and extra, add warnings + // here as necessary + #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS \ + SIMDUTF_PUSH_DISABLE_WARNINGS \ + SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wall) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \ SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable) #define SIMDUTF_PRAGMA(P) _Pragma(#P) - #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING) + #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) \ + SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING) #if defined(SIMDUTF_CLANG_VISUAL_STUDIO) - #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include) + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS \ + SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include) #else - 
#define SIMDUTF_DISABLE_UNDESIRED_WARNINGS + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS #endif - #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations) - #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow) + #define SIMDUTF_DISABLE_DEPRECATED_WARNING \ + SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations) + #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING \ + SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow) #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop") + #define SIMDUTF_DISABLE_UNUSED_WARNING \ + SIMDUTF_PUSH_DISABLE_WARNINGS \ + SIMDUTF_DISABLE_GCC_WARNING(-Wunused-function) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wunused-const-variable) + // clang-format on +#endif // MSC_VER +// Conditional constexpr macro: expands to constexpr for C++17+, empty otherwise +#if SIMDUTF_CPLUSPLUS17 + #define simdutf_constexpr constexpr +#else + #define simdutf_constexpr +#endif -#endif // MSC_VER +// Will evaluate to constexpr in C++23 or later. This makes it possible to mark +// functions constexpr if the "if consteval" feature is available to use. +#if SIMDUTF_CPLUSPLUS23 + #define simdutf_constexpr23 constexpr +#else + #define simdutf_constexpr23 +#endif #ifndef SIMDUTF_DLLIMPORTEXPORT - #if defined(SIMDUTF_VISUAL_STUDIO) - /** - * It does not matter here whether you are using - * the regular visual studio or clang under visual - * studio. - */ - #if SIMDUTF_USING_LIBRARY - #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport) - #else + #if defined(SIMDUTF_VISUAL_STUDIO) // Visual Studio + /** + * Windows users need to do some extra work when building + * or using a dynamic library (DLL). When building, we need + * to set SIMDUTF_DLLIMPORTEXPORT to __declspec(dllexport). + * When *using* the DLL, the user needs to set + * SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport). + * + * Static libraries not need require such work. + * + * It does not matter here whether you are using + * the regular visual studio or clang under visual + * studio, you still need to handle these issues. + * + * Non-Windows systems do not have this complexity. + */ + #if SIMDUTF_BUILDING_WINDOWS_DYNAMIC_LIBRARY + + // We set SIMDUTF_BUILDING_WINDOWS_DYNAMIC_LIBRARY when we build a DLL + // under Windows. It should never happen that both + // SIMDUTF_BUILDING_WINDOWS_DYNAMIC_LIBRARY and + // SIMDUTF_USING_WINDOWS_DYNAMIC_LIBRARY are set. #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport) - #endif + #elif SIMDUTF_USING_WINDOWS_DYNAMIC_LIBRARY + // Windows user who call a dynamic library should set + // SIMDUTF_USING_WINDOWS_DYNAMIC_LIBRARY to 1. + + #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport) #else + // We assume by default static linkage #define SIMDUTF_DLLIMPORTEXPORT #endif + #else // defined(SIMDUTF_VISUAL_STUDIO) + // Non-Windows systems do not have this complexity. + #define SIMDUTF_DLLIMPORTEXPORT + #endif // defined(SIMDUTF_VISUAL_STUDIO) #endif -/// If EXPR is an error, returns it. 
-#define SIMDUTF_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } - +#if SIMDUTF_MAYBE_UNUSED_AVAILABLE + #define simdutf_maybe_unused [[maybe_unused]] +#else + #define simdutf_maybe_unused +#endif #endif // SIMDUTF_COMMON_DEFS_H /* end file include/simdutf/common_defs.h */ /* begin file include/simdutf/encoding_types.h */ +#ifndef SIMDUTF_ENCODING_TYPES_H +#define SIMDUTF_ENCODING_TYPES_H #include +#if !defined(SIMDUTF_NO_STD_TEXT_ENCODING) && \ + defined(__cpp_lib_text_encoding) && __cpp_lib_text_encoding >= 202306L + #define SIMDUTF_HAS_STD_TEXT_ENCODING 1 + #include +#endif + namespace simdutf { enum encoding_type { - UTF8 = 1, // BOM 0xef 0xbb 0xbf - UTF16_LE = 2, // BOM 0xff 0xfe - UTF16_BE = 4, // BOM 0xfe 0xff - UTF32_LE = 8, // BOM 0xff 0xfe 0x00 0x00 - UTF32_BE = 16, // BOM 0x00 0x00 0xfe 0xff - Latin1 = 32, - - unspecified = 0 + UTF8 = 1, // BOM 0xef 0xbb 0xbf + UTF16_LE = 2, // BOM 0xff 0xfe + UTF16_BE = 4, // BOM 0xfe 0xff + UTF32_LE = 8, // BOM 0xff 0xfe 0x00 0x00 + UTF32_BE = 16, // BOM 0x00 0x00 0xfe 0xff + Latin1 = 32, + + unspecified = 0 }; +#ifndef SIMDUTF_IS_BIG_ENDIAN + #error "SIMDUTF_IS_BIG_ENDIAN needs to be defined." +#endif + enum endianness { - LITTLE = 0, - BIG = 1 + LITTLE = 0, + BIG = 1, + NATIVE = +#if SIMDUTF_IS_BIG_ENDIAN + BIG +#else + LITTLE +#endif }; -bool match_system(endianness e); +simdutf_warn_unused simdutf_really_inline constexpr bool +match_system(endianness e) { + return e == endianness::NATIVE; +} -std::string to_string(encoding_type bom); +simdutf_warn_unused std::string to_string(encoding_type bom); // Note that BOM for UTF8 is discouraged. namespace BOM { @@ -534,18 +649,138 @@ namespace BOM { * @return the corresponding encoding */ -encoding_type check_bom(const uint8_t* byte, size_t length); -encoding_type check_bom(const char* byte, size_t length); +simdutf_warn_unused encoding_type check_bom(const uint8_t *byte, size_t length); +simdutf_warn_unused encoding_type check_bom(const char *byte, size_t length); /** * Returns the size, in bytes, of the BOM for a given encoding type. * Note that UTF8 BOM are discouraged. * @param bom the encoding type * @return the size in bytes of the corresponding BOM */ -size_t bom_byte_size(encoding_type bom); +simdutf_warn_unused size_t bom_byte_size(encoding_type bom); + +} // namespace BOM + +#ifdef SIMDUTF_HAS_STD_TEXT_ENCODING +/** + * Convert a simdutf encoding type to a std::text_encoding. + * + * @param enc the simdutf encoding type + * @return the corresponding std::text_encoding, or + * std::text_encoding::id::unknown for unspecified/unsupported + */ +simdutf_warn_unused constexpr std::text_encoding +to_std_encoding(encoding_type enc) noexcept { + switch (enc) { + case UTF8: + return std::text_encoding(std::text_encoding::id::UTF8); + case UTF16_LE: + return std::text_encoding(std::text_encoding::id::UTF16LE); + case UTF16_BE: + return std::text_encoding(std::text_encoding::id::UTF16BE); + case UTF32_LE: + return std::text_encoding(std::text_encoding::id::UTF32LE); + case UTF32_BE: + return std::text_encoding(std::text_encoding::id::UTF32BE); + case Latin1: + return std::text_encoding(std::text_encoding::id::ISOLatin1); + case unspecified: + default: + return std::text_encoding(std::text_encoding::id::unknown); + } +} + +/** + * Convert a std::text_encoding to a simdutf encoding type. 
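+ * The mapping inspects enc.mib(), the encoding's registered MIBenum id, + * so only encodings with a registered id can match.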
+ * + * @param enc the std::text_encoding + * @return the corresponding simdutf encoding type, or + * encoding_type::unspecified if the encoding is not supported + */ +simdutf_warn_unused constexpr encoding_type +from_std_encoding(const std::text_encoding &enc) noexcept { + switch (enc.mib()) { + case std::text_encoding::id::UTF8: + return UTF8; + case std::text_encoding::id::UTF16LE: + return UTF16_LE; + case std::text_encoding::id::UTF16BE: + return UTF16_BE; + case std::text_encoding::id::UTF32LE: + return UTF32_LE; + case std::text_encoding::id::UTF32BE: + return UTF32_BE; + case std::text_encoding::id::ISOLatin1: + return Latin1; + default: + return unspecified; + } +} + +/** + * Get the native-endian UTF-16 encoding type for this system. + * + * @return UTF16_LE on little-endian systems, UTF16_BE on big-endian systems + */ +simdutf_warn_unused constexpr encoding_type native_utf16_encoding() noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return UTF16_BE; + #else + return UTF16_LE; + #endif +} + +/** + * Get the native-endian UTF-32 encoding type for this system. + * + * @return UTF32_LE on little-endian systems, UTF32_BE on big-endian systems + */ +simdutf_warn_unused constexpr encoding_type native_utf32_encoding() noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return UTF32_BE; + #else + return UTF32_LE; + #endif +} + +/** + * Convert a std::text_encoding to a simdutf encoding type, + * using native endianness for UTF-16/UTF-32 without explicit endianness. + * + * When the input is std::text_encoding::id::UTF16 or UTF32 (without LE/BE + * suffix), this returns the native-endian simdutf variant. + * + * @param enc the std::text_encoding + * @return the corresponding simdutf encoding type, or + * encoding_type::unspecified if the encoding is not supported + */ +simdutf_warn_unused constexpr encoding_type +from_std_encoding_native(const std::text_encoding &enc) noexcept { + switch (enc.mib()) { + case std::text_encoding::id::UTF8: + return UTF8; + case std::text_encoding::id::UTF16: + return native_utf16_encoding(); + case std::text_encoding::id::UTF16LE: + return UTF16_LE; + case std::text_encoding::id::UTF16BE: + return UTF16_BE; + case std::text_encoding::id::UTF32: + return native_utf32_encoding(); + case std::text_encoding::id::UTF32LE: + return UTF32_LE; + case std::text_encoding::id::UTF32BE: + return UTF32_BE; + case std::text_encoding::id::ISOLatin1: + return Latin1; + default: + return unspecified; + } +} +#endif // SIMDUTF_HAS_STD_TEXT_ENCODING -} // BOM namespace -} // simdutf namespace +} // namespace simdutf +#endif /* end file include/simdutf/encoding_types.h */ /* begin file include/simdutf/error.h */ #ifndef SIMDUTF_ERROR_H @@ -554,32 +789,125 @@ namespace simdutf { enum error_code { SUCCESS = 0, - HEADER_BITS, // Any byte must have fewer than 5 header bits. - TOO_SHORT, // The leading byte must be followed by N-1 continuation bytes, where N is the UTF-8 character length - // This is also the error when the input is truncated. - TOO_LONG, // We either have too many consecutive continuation bytes or the string starts with a continuation byte. - OVERLONG, // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters, - // and U+FFFF for four-byte characters. 
- TOO_LARGE, // The decoded character must be less than or equal to U+10FFFF,less than or equal than U+7F for ASCII OR less than equal than U+FF for Latin1 - SURROGATE, // The decoded character must be not be in U+D800...DFFF (UTF-8 or UTF-32) OR - // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) OR - // there must be no surrogate at all (Latin1) - INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid base64 string. - BASE64_INPUT_REMAINDER, // The base64 input terminates with a single character, excluding padding (=). - OUTPUT_BUFFER_TOO_SMALL, // The provided buffer is too small. - OTHER // Not related to validation/transcoding. + HEADER_BITS, // Any byte must have fewer than 5 header bits. + TOO_SHORT, // The leading byte must be followed by N-1 continuation bytes, + // where N is the UTF-8 character length. This is also the error + // when the input is truncated. + TOO_LONG, // We either have too many consecutive continuation bytes or the + // string starts with a continuation byte. + OVERLONG, // The decoded character must be above U+7F for two-byte characters, + // U+7FF for three-byte characters, and U+FFFF for four-byte + // characters. + TOO_LARGE, // The decoded character must be less than or equal to + // U+10FFFF, less than or equal to U+7F for ASCII, or less + // than or equal to U+FF for Latin1 + SURROGATE, // The decoded character must not be in U+D800...DFFF (UTF-8 or + // UTF-32) + // OR + // a high surrogate must be followed by a low surrogate + // and a low surrogate must be preceded by a high surrogate + // (UTF-16) + // OR + // there must be no surrogate at all and one is + // found (Latin1 functions) + // OR + // *specifically* for the function + // utf8_length_from_utf16_with_replacement, a surrogate (whether + // in error or not) has been found (i.e., whether we are in the + // Basic Multilingual Plane or not). + INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid + // base64 string. This may include a misplaced + // padding character ('='). + BASE64_INPUT_REMAINDER, // The base64 input terminates with a single + // character, excluding padding (=). It is also used + // in strict mode when padding is not adequate. + BASE64_EXTRA_BITS, // The base64 input terminates with non-zero + // padding bits. + OUTPUT_BUFFER_TOO_SMALL, // The provided buffer is too small. + OTHER // Not related to validation/transcoding. }; +#if SIMDUTF_CPLUSPLUS17 +inline std::string_view error_to_string(error_code code) noexcept { + switch (code) { + case SUCCESS: + return "SUCCESS"; + case HEADER_BITS: + return "HEADER_BITS"; + case TOO_SHORT: + return "TOO_SHORT"; + case TOO_LONG: + return "TOO_LONG"; + case OVERLONG: + return "OVERLONG"; + case TOO_LARGE: + return "TOO_LARGE"; + case SURROGATE: + return "SURROGATE"; + case INVALID_BASE64_CHARACTER: + return "INVALID_BASE64_CHARACTER"; + case BASE64_INPUT_REMAINDER: + return "BASE64_INPUT_REMAINDER"; + case BASE64_EXTRA_BITS: + return "BASE64_EXTRA_BITS"; + case OUTPUT_BUFFER_TOO_SMALL: + return "OUTPUT_BUFFER_TOO_SMALL"; + default: + return "OTHER"; + } +} +#endif struct result { error_code error; - size_t count; // In case of error, indicates the position of the error. In case of success, indicates the number of code units validated/written. + size_t count; // In case of error, indicates the position of the error. In + // case of success, indicates the number of code units + // validated/written.
+ + simdutf_really_inline simdutf_constexpr23 result() noexcept + : error{error_code::SUCCESS}, count{0} {} - simdutf_really_inline result() : error{error_code::SUCCESS}, count{0} {} + simdutf_really_inline simdutf_constexpr23 result(error_code err, + size_t pos) noexcept + : error{err}, count{pos} {} - simdutf_really_inline result(error_code _err, size_t _pos) : error{_err}, count{_pos} {} + simdutf_really_inline simdutf_constexpr23 bool is_ok() const noexcept { + return error == error_code::SUCCESS; + } + + simdutf_really_inline simdutf_constexpr23 bool is_err() const noexcept { + return error != error_code::SUCCESS; + } }; -} +struct full_result { + error_code error; + size_t input_count; + size_t output_count; + bool padding_error = false; // true if the error is due to padding, only + // meaningful when error is not SUCCESS + + simdutf_really_inline simdutf_constexpr23 full_result() noexcept + : error{error_code::SUCCESS}, input_count{0}, output_count{0} {} + + simdutf_really_inline simdutf_constexpr23 full_result(error_code err, + size_t pos_in, + size_t pos_out) noexcept + : error{err}, input_count{pos_in}, output_count{pos_out} {} + simdutf_really_inline simdutf_constexpr23 full_result( + error_code err, size_t pos_in, size_t pos_out, bool padding_err) noexcept + : error{err}, input_count{pos_in}, output_count{pos_out}, + padding_error{padding_err} {} + + simdutf_really_inline simdutf_constexpr23 operator result() const noexcept { + if (error == error_code::SUCCESS) { + return result{error, output_count}; + } else { + return result{error, input_count}; + } + } +}; + +} // namespace simdutf #endif /* end file include/simdutf/error.h */ @@ -594,22 +922,22 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS #define SIMDUTF_SIMDUTF_VERSION_H /** The version of simdutf being used (major.minor.revision) */ -#define SIMDUTF_VERSION "5.2.8" +#define SIMDUTF_VERSION "8.0.0" namespace simdutf { enum { /** * The major version (MAJOR.minor.revision) of simdutf being used. */ - SIMDUTF_VERSION_MAJOR = 5, + SIMDUTF_VERSION_MAJOR = 8, /** * The minor version (major.MINOR.revision) of simdutf being used. */ - SIMDUTF_VERSION_MINOR = 2, + SIMDUTF_VERSION_MINOR = 0, /** * The revision (major.minor.REVISION) of simdutf being used. */ - SIMDUTF_VERSION_REVISION = 8 + SIMDUTF_VERSION_REVISION = 0 }; } // namespace simdutf @@ -618,12 +946,13 @@ enum { /* begin file include/simdutf/implementation.h */ #ifndef SIMDUTF_IMPLEMENTATION_H #define SIMDUTF_IMPLEMENTATION_H -#include #if !defined(SIMDUTF_NO_THREADS) -#include + #include +#endif +#include +#ifdef SIMDUTF_INTERNAL_TESTS + #include #endif -#include -#include /* begin file include/simdutf/internal/isadetection.h */ /* From https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h @@ -676,23 +1005,33 @@ POSSIBILITY OF SUCH DAMAGE. #include #include #if defined(_MSC_VER) -#include + #include #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) -#include + #include #endif // RISC-V ISA detection utilities #if SIMDUTF_IS_RISCV64 && defined(__linux__) -#include // for syscall + #include // for syscall // We define these ourselves, for backwards compatibility -struct simdutf_riscv_hwprobe { int64_t key; uint64_t value; }; -#define simdutf_riscv_hwprobe(...) syscall(258, __VA_ARGS__) -#define SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0 4 -#define SIMDUTF_RISCV_HWPROBE_IMA_V (1 << 2) -#define SIMDUTF_RISCV_HWPROBE_EXT_ZVBB (1 << 17) +struct simdutf_riscv_hwprobe { + int64_t key; + uint64_t value; +}; + #define simdutf_riscv_hwprobe(...) 
syscall(258, __VA_ARGS__) + #define SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0 4 + #define SIMDUTF_RISCV_HWPROBE_IMA_V (1 << 2) + #define SIMDUTF_RISCV_HWPROBE_EXT_ZVBB (1 << 17) #endif // SIMDUTF_IS_RISCV64 && defined(__linux__) +#if defined(__loongarch__) && defined(__linux__) + #include +// bits/hwcap.h +// #define HWCAP_LOONGARCH_LSX (1 << 4) +// #define HWCAP_LOONGARCH_LASX (1 << 5) +#endif + namespace simdutf { namespace internal { @@ -717,6 +1056,8 @@ enum instruction_set { AVX512VPOPCNTDQ = 0x2000, RVV = 0x4000, ZVBB = 0x8000, + LSX = 0x40000, + LASX = 0x80000, }; #if defined(__PPC64__) @@ -729,15 +1070,16 @@ static inline uint32_t detect_supported_architectures() { static inline uint32_t detect_supported_architectures() { uint32_t host_isa = instruction_set::DEFAULT; -#if SIMDUTF_IS_RVV + #if SIMDUTF_IS_RVV host_isa |= instruction_set::RVV; -#endif -#if SIMDUTF_IS_ZVBB + #endif + #if SIMDUTF_IS_ZVBB host_isa |= instruction_set::ZVBB; -#endif -#if defined(__linux__) - simdutf_riscv_hwprobe probes[] = { { SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0, 0 } }; - long ret = simdutf_riscv_hwprobe(&probes, sizeof probes/sizeof *probes, 0, nullptr, 0); + #endif + #if defined(__linux__) + simdutf_riscv_hwprobe probes[] = {{SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0, 0}}; + long ret = simdutf_riscv_hwprobe(&probes, sizeof probes / sizeof *probes, 0, + nullptr, 0); if (ret == 0) { uint64_t extensions = probes[0].value; if (extensions & SIMDUTF_RISCV_HWPROBE_IMA_V) @@ -745,11 +1087,11 @@ static inline uint32_t detect_supported_architectures() { if (extensions & SIMDUTF_RISCV_HWPROBE_EXT_ZVBB) host_isa |= instruction_set::ZVBB; } -#endif -#if defined(RUN_IN_SPIKE_SIMULATOR) + #endif + #if defined(RUN_IN_SPIKE_SIMULATOR) // Proxy Kernel does not implement yet hwprobe syscall host_isa |= instruction_set::RVV; -#endif + #endif return host_isa; } @@ -761,80 +1103,82 @@ static inline uint32_t detect_supported_architectures() { #elif defined(__x86_64__) || defined(_M_AMD64) // x64 - namespace { namespace cpuid_bit { - // Can be found on Intel ISA Reference for CPUID - - // EAX = 0x01 - constexpr uint32_t pclmulqdq = uint32_t(1) << 1; ///< @private bit 1 of ECX for EAX=0x1 - constexpr uint32_t sse42 = uint32_t(1) << 20; ///< @private bit 20 of ECX for EAX=0x1 - constexpr uint32_t osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1 - - // EAX = 0x7f (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf) - // See: "Table 3-8. 
Information Returned by CPUID Instruction" - namespace ebx { - constexpr uint32_t bmi1 = uint32_t(1) << 3; - constexpr uint32_t avx2 = uint32_t(1) << 5; - constexpr uint32_t bmi2 = uint32_t(1) << 8; - constexpr uint32_t avx512f = uint32_t(1) << 16; - constexpr uint32_t avx512dq = uint32_t(1) << 17; - constexpr uint32_t avx512ifma = uint32_t(1) << 21; - constexpr uint32_t avx512cd = uint32_t(1) << 28; - constexpr uint32_t avx512bw = uint32_t(1) << 30; - constexpr uint32_t avx512vl = uint32_t(1) << 31; - } - - namespace ecx { - constexpr uint32_t avx512vbmi = uint32_t(1) << 1; - constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6; - constexpr uint32_t avx512vnni = uint32_t(1) << 11; - constexpr uint32_t avx512bitalg = uint32_t(1) << 12; - constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14; - } - namespace edx { - constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8; - } - namespace xcr0_bit { - constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX - constexpr uint64_t avx512_saved = uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM - } - } +// Can be found on Intel ISA Reference for CPUID + +// EAX = 0x01 +constexpr uint32_t pclmulqdq = uint32_t(1) + << 1; ///< @private bit 1 of ECX for EAX=0x1 +constexpr uint32_t sse42 = uint32_t(1) + << 20; ///< @private bit 20 of ECX for EAX=0x1 +constexpr uint32_t osxsave = + (uint32_t(1) << 26) | + (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1 + +// EAX = 0x07 (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf) +// See: "Table 3-8. Information Returned by CPUID Instruction" +namespace ebx { +constexpr uint32_t bmi1 = uint32_t(1) << 3; +constexpr uint32_t avx2 = uint32_t(1) << 5; +constexpr uint32_t bmi2 = uint32_t(1) << 8; +constexpr uint32_t avx512f = uint32_t(1) << 16; +constexpr uint32_t avx512dq = uint32_t(1) << 17; +constexpr uint32_t avx512ifma = uint32_t(1) << 21; +constexpr uint32_t avx512cd = uint32_t(1) << 28; +constexpr uint32_t avx512bw = uint32_t(1) << 30; +constexpr uint32_t avx512vl = uint32_t(1) << 31; +} // namespace ebx + +namespace ecx { +constexpr uint32_t avx512vbmi = uint32_t(1) << 1; +constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6; +constexpr uint32_t avx512vnni = uint32_t(1) << 11; +constexpr uint32_t avx512bitalg = uint32_t(1) << 12; +constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14; +} // namespace ecx +namespace edx { +constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8; } - - +namespace xcr0_bit { +constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX +constexpr uint64_t avx512_saved = + uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM +} // namespace xcr0_bit +} // namespace cpuid_bit +} // namespace static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { -#if defined(_MSC_VER) + #if defined(_MSC_VER) int cpu_info[4]; __cpuidex(cpu_info, *eax, *ecx); *eax = cpu_info[0]; *ebx = cpu_info[1]; *ecx = cpu_info[2]; *edx = cpu_info[3]; -#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) + #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) uint32_t level = *eax; __get_cpuid(level, eax, ebx, ecx, edx); -#else + #else uint32_t a = *eax, b, c = *ecx, d; asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); *eax = a; *ebx = b; *ecx = c; *edx = d; -#endif + #endif } static inline uint64_t xgetbv() { - #if defined(_MSC_VER) - return _xgetbv(0); - #else - uint32_t xcr0_lo, xcr0_hi; - asm volatile("xgetbv\n\t" : "=a" (xcr0_lo), "=d"
(xcr0_hi) : "c" (0)); - return xcr0_lo | ((uint64_t)xcr0_hi << 32); - #endif - } + #if defined(_MSC_VER) + return _xgetbv(0); + #else + uint32_t xcr0_lo, xcr0_hi; + asm volatile("xgetbv\n\t" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0)); + return xcr0_lo | ((uint64_t)xcr0_hi << 32); + #endif +} static inline uint32_t detect_supported_architectures() { uint32_t eax; @@ -878,7 +1222,8 @@ static inline uint32_t detect_supported_architectures() { if (ebx & cpuid_bit::ebx::bmi2) { host_isa |= instruction_set::BMI2; } - if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved)) { + if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == + cpuid_bit::xcr0_bit::avx512_saved)) { return host_isa; } if (ebx & cpuid_bit::ebx::avx512f) { @@ -904,6 +1249,22 @@ static inline uint32_t detect_supported_architectures() { } return host_isa; } +#elif defined(__loongarch__) + +static inline uint32_t detect_supported_architectures() { + uint32_t host_isa = instruction_set::DEFAULT; + #if defined(__linux__) + uint64_t hwcap = 0; + hwcap = getauxval(AT_HWCAP); + if (hwcap & HWCAP_LOONGARCH_LSX) { + host_isa |= instruction_set::LSX; + } + if (hwcap & HWCAP_LOONGARCH_LASX) { + host_isa |= instruction_set::LASX; + } + #endif + return host_isa; +} #else // fallback // includes 32-bit ARM. @@ -911,7 +1272,6 @@ static inline uint32_t detect_supported_architectures() { return instruction_set::DEFAULT; } - #endif // end SIMD extension detection code } // namespace internal @@ -920,44 +1280,3747 @@ static inline uint32_t detect_supported_architectures() { #endif // SIMDutf_INTERNAL_ISADETECTION_H /* end file include/simdutf/internal/isadetection.h */ +#if SIMDUTF_SPAN + #include + #include + #include + #include +#endif +#if SIMDUTF_CPLUSPLUS17 + #include +#endif +// The following defines are conditionally enabled/disabled during amalgamation. +// By default all features are enabled, regular code shouldn't check them. Only +// when user code really relies on a selected subset, it's good to verify these +// flags, like: +// +// #if !SIMDUTF_FEATURE_UTF16 +// # error("Please amalgamate simdutf with UTF-16 support") +// #endif +// +#define SIMDUTF_FEATURE_DETECT_ENCODING 1 +#define SIMDUTF_FEATURE_ASCII 1 +#define SIMDUTF_FEATURE_LATIN1 1 +#define SIMDUTF_FEATURE_UTF8 1 +#define SIMDUTF_FEATURE_UTF16 1 +#define SIMDUTF_FEATURE_UTF32 1 +#define SIMDUTF_FEATURE_BASE64 1 + +#if SIMDUTF_CPLUSPLUS23 +/* begin file include/simdutf/constexpr_ptr.h */ +#ifndef SIMDUTF_CONSTEXPR_PTR_H +#define SIMDUTF_CONSTEXPR_PTR_H + +#include namespace simdutf { - +namespace detail { /** - * Autodetect the encoding of the input, a single encoding is recommended. - * E.g., the function might return simdutf::encoding_type::UTF8, - * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or - * simdutf::encoding_type::UTF32_LE. - * - * @param input the string to analyze. - * @param length the length of the string in bytes. - * @return the detected encoding type + * The constexpr_ptr class is a workaround for reinterpret_cast not being + * allowed during constant evaluation.
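+ * Dereferencing or indexing performs an element-wise static_cast to the + * target type, so no memory is actually reinterpreted and the operation + * remains valid in a constant expression.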
*/ -simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * input, size_t length) noexcept; -simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t * input, size_t length) noexcept { - return autodetect_encoding(reinterpret_cast(input), length); +template + requires(sizeof(to) == sizeof(from)) +struct constexpr_ptr { + const from *p; + + constexpr explicit constexpr_ptr(const from *ptr) noexcept : p(ptr) {} + + constexpr to operator*() const noexcept { return static_cast(*p); } + + constexpr constexpr_ptr &operator++() noexcept { + ++p; + return *this; + } + + constexpr constexpr_ptr operator++(int) noexcept { + auto old = *this; + ++p; + return old; + } + + constexpr constexpr_ptr &operator--() noexcept { + --p; + return *this; + } + + constexpr constexpr_ptr operator--(int) noexcept { + auto old = *this; + --p; + return old; + } + + constexpr constexpr_ptr &operator+=(std::ptrdiff_t n) noexcept { + p += n; + return *this; + } + + constexpr constexpr_ptr &operator-=(std::ptrdiff_t n) noexcept { + p -= n; + return *this; + } + + constexpr constexpr_ptr operator+(std::ptrdiff_t n) const noexcept { + return constexpr_ptr{p + n}; + } + + constexpr constexpr_ptr operator-(std::ptrdiff_t n) const noexcept { + return constexpr_ptr{p - n}; + } + + constexpr std::ptrdiff_t operator-(const constexpr_ptr &o) const noexcept { + return p - o.p; + } + + constexpr to operator[](std::ptrdiff_t n) const noexcept { + return static_cast(*(p + n)); + } + + // to prevent compilation errors for memcpy, even if it is never + // called during constant evaluation + constexpr operator const void *() const noexcept { return p; } +}; + +template +constexpr constexpr_ptr constexpr_cast_ptr(from *p) noexcept { + return constexpr_ptr{p}; } /** - * Autodetect the possible encodings of the input in one pass. - * E.g., if the input might be UTF-16LE or UTF-8, this function returns - * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE). - * - * Overridden by each implementation. - * - * @param input the string to analyze. - * @param length the length of the string in bytes. - * @return the detected encoding type + * helper type for constexpr_writeptr, so it is possible to + * do "*ptr = val;" */ -simdutf_warn_unused int detect_encodings(const char * input, size_t length) noexcept; -simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * input, size_t length) noexcept { - return detect_encodings(reinterpret_cast(input), length); -} +template +struct constexpr_write_ptr_proxy { + + constexpr explicit constexpr_write_ptr_proxy(TargetType *raw) : p(raw) {} + + constexpr constexpr_write_ptr_proxy &operator=(SrcType v) { + *p = static_cast(v); + return *this; + } + + TargetType *p; +}; /** - * Validate the UTF-8 string. This function may be best when you expect - * the input to be almost always valid. Otherwise, consider using - * validate_utf8_with_errors. + * helper for working around reinterpret_cast not being allowed during constexpr + * evaluation. 
will try to act as a SrcType* but actually write to the pointer + * given in the constructor, which is of another type TargetType + */ +template struct constexpr_write_ptr { + constexpr explicit constexpr_write_ptr(TargetType *raw) : p(raw) {} + + constexpr constexpr_write_ptr_proxy operator*() const { + return constexpr_write_ptr_proxy{p}; + } + + constexpr constexpr_write_ptr_proxy + operator[](std::ptrdiff_t n) const { + return constexpr_write_ptr_proxy{p + n}; + } + + constexpr constexpr_write_ptr &operator++() { + ++p; + return *this; + } + + constexpr constexpr_write_ptr operator++(int) { + constexpr_write_ptr old = *this; + ++p; + return old; + } + + constexpr std::ptrdiff_t operator-(const constexpr_write_ptr &other) const { + return p - other.p; + } + + TargetType *p; +}; + +template +constexpr auto constexpr_cast_writeptr(TargetType *raw) { + return constexpr_write_ptr{raw}; +} + +} // namespace detail +} // namespace simdutf +#endif +/* end file include/simdutf/constexpr_ptr.h */ +#endif + +#if SIMDUTF_SPAN +/// helpers placed in namespace detail are not a part of the public API +namespace simdutf { +namespace detail { +/** + * matches a byte, in the many ways C++ allows. note that these + * are all distinct types. + */ +template +concept byte_like = std::is_same_v || // + std::is_same_v || // + std::is_same_v || // + std::is_same_v || // + std::is_same_v; + +template +concept is_byte_like = byte_like>; + +template +concept is_pointer = std::is_pointer_v; + +/** + * matches anything that behaves like std::span and points to character-like + * data such as: std::byte, char, unsigned char, signed char, std::int8_t, + * std::uint8_t + */ +template +concept input_span_of_byte_like = requires(const T &t) { + { t.size() } noexcept -> std::convertible_to; + { t.data() } noexcept -> is_pointer; + { *t.data() } noexcept -> is_byte_like; +}; + +template +concept is_mutable = !std::is_const_v>; + +/** + * like span_of_byte_like, but for an output span (intended to be written to) + */ +template +concept output_span_of_byte_like = requires(T &t) { + { t.size() } noexcept -> std::convertible_to; + { t.data() } noexcept -> is_pointer; + { *t.data() } noexcept -> is_byte_like; + { *t.data() } noexcept -> is_mutable; +}; + +/** + * a pointer like object, when indexed, results in a byte like result. + * valid examples: char*, const char*, std::array + * invalid examples: int*, std::array + */ +template +concept indexes_into_byte_like = requires(InputPtr p) { + { std::decay_t{} } -> simdutf::detail::byte_like; +}; +template +concept indexes_into_utf16 = requires(InputPtr p) { + { std::decay_t{} } -> std::same_as; +}; +template +concept indexes_into_utf32 = requires(InputPtr p) { + { std::decay_t{} } -> std::same_as; +}; + +template +concept index_assignable_from_char = requires(InputPtr p, char s) { + { p[0] = s }; +}; + +/** + * a pointer like object that results in a uint32_t when indexed. + * valid examples: uint32_t* + */ +template +concept indexes_into_uint32 = requires(InputPtr p) { + { std::decay_t{} } -> std::same_as; +}; +} // namespace detail +} // namespace simdutf +#endif // SIMDUTF_SPAN + +// these includes are needed for constexpr support. they are +// not part of the public api. 
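As a brief aside before the scalar headers that follow: the detail::constexpr_cast_ptr helper defined above stands in for reinterpret_cast, which is ill-formed in constant expressions. A minimal sketch of the idea, assuming a C++23 build (the helper is only compiled when SIMDUTF_CPLUSPLUS23 is set) with the amalgamated simdutf sources in scope; first_code_unit is a hypothetical illustration, not a simdutf API:

#include <cstdint>

// Read the first byte of a string during constant evaluation. Writing
// reinterpret_cast<const uint8_t *>(s) here would be ill-formed in a
// constexpr context; constexpr_cast_ptr wraps the pointer and applies an
// element-wise static_cast on dereference instead.
constexpr uint8_t first_code_unit(const char *s) {
  auto p = simdutf::detail::constexpr_cast_ptr<uint8_t>(s);
  return *p;
}
static_assert(first_code_unit("A") == 0x41); // evaluated at compile time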
+/* begin file include/simdutf/scalar/swap_bytes.h */ +#ifndef SIMDUTF_SWAP_BYTES_H +#define SIMDUTF_SWAP_BYTES_H + +namespace simdutf { +namespace scalar { + +constexpr inline simdutf_warn_unused uint16_t +u16_swap_bytes(const uint16_t word) { + return uint16_t((word >> 8) | (word << 8)); +} + +constexpr inline simdutf_warn_unused uint32_t +u32_swap_bytes(const uint32_t word) { + return ((word >> 24) & 0xff) | // move byte 3 to byte 0 + ((word << 8) & 0xff0000) | // move byte 1 to byte 2 + ((word >> 8) & 0xff00) | // move byte 2 to byte 1 + ((word << 24) & 0xff000000); // byte 0 to byte 3 +} + +namespace utf32 { +template constexpr uint32_t swap_if_needed(uint32_t c) { + return !match_system(big_endian) ? scalar::u32_swap_bytes(c) : c; +} +} // namespace utf32 + +namespace utf16 { +template constexpr uint16_t swap_if_needed(uint16_t c) { + return !match_system(big_endian) ? scalar::u16_swap_bytes(c) : c; +} +} // namespace utf16 + +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/swap_bytes.h */ +/* begin file include/simdutf/scalar/ascii.h */ +#ifndef SIMDUTF_ASCII_H +#define SIMDUTF_ASCII_H + +namespace simdutf { +namespace scalar { +namespace { +namespace ascii { + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_warn_unused simdutf_constexpr23 bool validate(InputPtr data, + size_t len) noexcept { + uint64_t pos = 0; + +#if SIMDUTF_CPLUSPLUS23 + // avoid memcpy during constant evaluation + if !consteval +#endif + // process in blocks of 16 bytes when possible + { + for (; pos + 16 <= len; pos += 16) { + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) != 0) { + return false; + } + } + } + + // process the tail byte-by-byte + for (; pos < len; pos++) { + if (static_cast(data[pos]) >= 0b10000000) { + return false; + } + } + return true; +} +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_warn_unused simdutf_constexpr23 result +validate_with_errors(InputPtr data, size_t len) noexcept { + size_t pos = 0; +#if SIMDUTF_CPLUSPLUS23 + // avoid memcpy during constant evaluation + if !consteval +#endif + { + // process in blocks of 16 bytes when possible + for (; pos + 16 <= len; pos += 16) { + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) != 0) { + for (; pos < len; pos++) { + if (static_cast(data[pos]) >= 0b10000000) { + return result(error_code::TOO_LARGE, pos); + } + } + } + } + } + + // process the tail byte-by-byte + for (; pos < len; pos++) { + if (static_cast(data[pos]) >= 0b10000000) { + return result(error_code::TOO_LARGE, pos); + } + } + return result(error_code::SUCCESS, pos); +} + +} // namespace ascii +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/ascii.h */ +/* begin file include/simdutf/scalar/atomic_util.h */ +#ifndef SIMDUTF_ATOMIC_UTIL_H +#define SIMDUTF_ATOMIC_UTIL_H +#if SIMDUTF_ATOMIC_REF + #include +namespace simdutf { +namespace scalar { + +// This function is a memcpy that uses atomic operations to read from the +// source. 
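+// Each load is atomic (memory_order_relaxed), so reading while another +// thread writes the source is race-free, though the values observed may +// be torn across the 64-bit chunks.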
+inline void memcpy_atomic_read(char *dst, const char *src, size_t len) { + static_assert(std::atomic_ref::required_alignment == sizeof(char), + "std::atomic_ref requires the same alignment as char_type"); + // We expect all 64-bit systems to be able to read 64-bit words from an + // aligned memory region atomically. You might be able to do better on + // specific systems, e.g., x64 systems can read 128-bit words atomically. + constexpr size_t alignment = sizeof(uint64_t); + + // Lambda for atomic byte-by-byte copy + auto bbb_memcpy_atomic_read = [](char *bytedst, const char *bytesrc, + size_t bytelen) noexcept { + char *mutable_src = const_cast(bytesrc); + for (size_t j = 0; j < bytelen; ++j) { + bytedst[j] = + std::atomic_ref(mutable_src[j]).load(std::memory_order_relaxed); + } + }; + + // Handle unaligned start + size_t offset = reinterpret_cast(src) % alignment; + if (offset) { + size_t to_align = std::min(len, alignment - offset); + bbb_memcpy_atomic_read(dst, src, to_align); + src += to_align; + dst += to_align; + len -= to_align; + } + + // Process aligned 64-bit chunks + while (len >= alignment) { + auto *src_aligned = reinterpret_cast(const_cast(src)); + const auto dst_value = + std::atomic_ref(*src_aligned).load(std::memory_order_relaxed); + std::memcpy(dst, &dst_value, sizeof(uint64_t)); + src += alignment; + dst += alignment; + len -= alignment; + } + + // Handle remaining bytes + if (len) { + bbb_memcpy_atomic_read(dst, src, len); + } +} + +// This function is a memcpy that uses atomic operations to write to the +// destination. +inline void memcpy_atomic_write(char *dst, const char *src, size_t len) { + static_assert(std::atomic_ref::required_alignment == sizeof(char), + "std::atomic_ref requires the same alignment as char"); + // We expect all 64-bit systems to be able to write 64-bit words to an aligned + // memory region atomically. + // You might be able to do better on specific systems, e.g., x64 systems can + // write 128-bit words atomically. 
+ constexpr size_t alignment = sizeof(uint64_t); + + // Lambda for atomic byte-by-byte write + auto bbb_memcpy_atomic_write = [](char *bytedst, const char *bytesrc, + size_t bytelen) noexcept { + for (size_t j = 0; j < bytelen; ++j) { + std::atomic_ref(bytedst[j]) + .store(bytesrc[j], std::memory_order_relaxed); + } + }; + + // Handle unaligned start + size_t offset = reinterpret_cast(dst) % alignment; + if (offset) { + size_t to_align = std::min(len, alignment - offset); + bbb_memcpy_atomic_write(dst, src, to_align); + dst += to_align; + src += to_align; + len -= to_align; + } + + // Process aligned 64-bit chunks + while (len >= alignment) { + auto *dst_aligned = reinterpret_cast(dst); + uint64_t src_val; + std::memcpy(&src_val, src, sizeof(uint64_t)); // Non-atomic read from src + std::atomic_ref(*dst_aligned) + .store(src_val, std::memory_order_relaxed); + dst += alignment; + src += alignment; + len -= alignment; + } + + // Handle remaining bytes + if (len) { + bbb_memcpy_atomic_write(dst, src, len); + } +} +} // namespace scalar +} // namespace simdutf +#endif // SIMDUTF_ATOMIC_REF +#endif // SIMDUTF_ATOMIC_UTIL_H +/* end file include/simdutf/scalar/atomic_util.h */ +/* begin file include/simdutf/scalar/latin1.h */ +#ifndef SIMDUTF_LATIN1_H +#define SIMDUTF_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace latin1 { + +simdutf_really_inline size_t utf8_length_from_latin1(const char *buf, + size_t len) { + const uint8_t *c = reinterpret_cast(buf); + size_t answer = 0; + for (size_t i = 0; i < len; i++) { + if ((c[i] >> 7)) { + answer++; + } + } + return answer + len; +} + +} // namespace latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/latin1.h */ +/* begin file include/simdutf/scalar/latin1_to_utf16/latin1_to_utf16.h */ +#ifndef SIMDUTF_LATIN1_TO_UTF16_H +#define SIMDUTF_LATIN1_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace latin1_to_utf16 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 size_t convert(InputPtr data, size_t len, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + + while (pos < len) { + uint16_t word = + uint8_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point + *utf16_output++ = + char16_t(match_system(big_endian) ? word : u16_swap_bytes(word)); + pos++; + } + + return utf16_output - start; +} + +template +inline result convert_with_errors(const char *buf, size_t len, + char16_t *utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t *start{utf16_output}; + + while (pos < len) { + uint16_t word = + uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point + *utf16_output++ = + char16_t(match_system(big_endian) ? 
word : u16_swap_bytes(word)); + pos++; + } + + return result(error_code::SUCCESS, utf16_output - start); +} + +} // namespace latin1_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/latin1_to_utf16/latin1_to_utf16.h */ +/* begin file include/simdutf/scalar/latin1_to_utf32/latin1_to_utf32.h */ +#ifndef SIMDUTF_LATIN1_TO_UTF32_H +#define SIMDUTF_LATIN1_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace latin1_to_utf32 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 size_t convert(InputPtr data, size_t len, + char32_t *utf32_output) { + char32_t *start{utf32_output}; + for (size_t i = 0; i < len; i++) { + *utf32_output++ = uint8_t(data[i]); + } + return utf32_output - start; +} + +} // namespace latin1_to_utf32 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/latin1_to_utf32/latin1_to_utf32.h */ +/* begin file include/simdutf/scalar/latin1_to_utf8/latin1_to_utf8.h */ +#ifndef SIMDUTF_LATIN1_TO_UTF8_H +#define SIMDUTF_LATIN1_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace latin1_to_utf8 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires(simdutf::detail::indexes_into_byte_like && + simdutf::detail::index_assignable_from_char) +#endif +simdutf_constexpr23 size_t convert(InputPtr data, size_t len, + OutputPtr utf8_output) { + // const unsigned char *data = reinterpret_cast(buf); + size_t pos = 0; + size_t utf8_pos = 0; + + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that + // they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | + v2}; // We are only interested in these bits: 1000 1000 1000 + // 1000, so it makes sense to concatenate everything + if ((v & 0x8080808080808080) == + 0) { // if NONE of these are set, e.g. 
all of them are zero, then + // everything is ASCII + size_t final_pos = pos + 16; + while (pos < final_pos) { + utf8_output[utf8_pos++] = char(data[pos]); + pos++; + } + continue; + } + } // if (pos + 16 <= len) + } // !consteval scope + + unsigned char byte = data[pos]; + if ((byte & 0x80) == 0) { // if ASCII + // will generate one UTF-8 bytes + utf8_output[utf8_pos++] = char(byte); + pos++; + } else { + // will generate two UTF-8 bytes + utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000); + utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000); + pos++; + } + } // while + return utf8_pos; +} + +simdutf_really_inline size_t convert(const char *buf, size_t len, + char *utf8_output) { + return convert(reinterpret_cast(buf), len, + utf8_output); +} + +inline size_t convert_safe(const char *buf, size_t len, char *utf8_output, + size_t utf8_len) { + const unsigned char *data = reinterpret_cast(buf); + size_t pos = 0; + size_t skip_pos = 0; + size_t utf8_pos = 0; + while (pos < len && utf8_pos < utf8_len) { + // try to convert the next block of 16 ASCII bytes + if (pos >= skip_pos && pos + 16 <= len && + utf8_pos + 16 <= utf8_len) { // if it is safe to read 16 more bytes, + // check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | + v2}; // We are only interested in these bits: 1000 1000 1000 + // 1000, so it makes sense to concatenate everything + if ((v & 0x8080808080808080) == + 0) { // if NONE of these are set, e.g. all of them are zero, then + // everything is ASCII + ::memcpy(utf8_output + utf8_pos, buf + pos, 16); + utf8_pos += 16; + pos += 16; + } else { + // At least one of the next 16 bytes are not ASCII, we will process them + // one by one + skip_pos = pos + 16; + } + } else { + const auto byte = data[pos]; + if ((byte & 0x80) == 0) { // if ASCII + // will generate one UTF-8 bytes + utf8_output[utf8_pos++] = char(byte); + pos++; + } else if (utf8_pos + 2 <= utf8_len) { + // will generate two UTF-8 bytes + utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000); + utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000); + pos++; + } else { + break; + } + } + } + return utf8_pos; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires(simdutf::detail::indexes_into_byte_like && + simdutf::detail::index_assignable_from_char) +#endif +simdutf_constexpr23 size_t convert_safe_constexpr(InputPtr data, size_t len, + OutputPtr utf8_output, + size_t utf8_len) { + size_t pos = 0; + size_t utf8_pos = 0; + while (pos < len && utf8_pos < utf8_len) { + const unsigned char byte = data[pos]; + if ((byte & 0x80) == 0) { // if ASCII + // will generate one UTF-8 bytes + utf8_output[utf8_pos++] = char(byte); + pos++; + } else if (utf8_pos + 2 <= utf8_len) { + // will generate two UTF-8 bytes + utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000); + utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000); + pos++; + } else { + break; + } + } + return utf8_pos; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 simdutf_warn_unused size_t +utf8_length_from_latin1(InputPtr input, size_t length) noexcept { + size_t answer = length; + size_t i = 0; + +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + auto pop = [](uint64_t v) { + return (size_t)(((v >> 7) & UINT64_C(0x0101010101010101)) * + UINT64_C(0x0101010101010101) >> + 56); + }; + for (; i + 32 <= length; i += 32) { + 
uint64_t v; + memcpy(&v, input + i, 8); + answer += pop(v); + memcpy(&v, input + i + 8, sizeof(v)); + answer += pop(v); + memcpy(&v, input + i + 16, sizeof(v)); + answer += pop(v); + memcpy(&v, input + i + 24, sizeof(v)); + answer += pop(v); + } + for (; i + 8 <= length; i += 8) { + uint64_t v; + memcpy(&v, input + i, sizeof(v)); + answer += pop(v); + } + } // !consteval scope + for (; i + 1 <= length; i += 1) { + answer += static_cast(input[i]) >> 7; + } + return answer; +} + +} // namespace latin1_to_utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/latin1_to_utf8/latin1_to_utf8.h */ +/* begin file include/simdutf/scalar/utf16.h */ +#ifndef SIMDUTF_UTF16_H +#define SIMDUTF_UTF16_H + +namespace simdutf { +namespace scalar { +namespace utf16 { + +template +simdutf_warn_unused simdutf_constexpr23 bool +validate_as_ascii(const char16_t *data, size_t len) noexcept { + for (size_t pos = 0; pos < len; pos++) { + char16_t word = scalar::utf16::swap_if_needed(data[pos]); + if (word >= 0x80) { + return false; + } + } + return true; +} + +template +inline simdutf_warn_unused simdutf_constexpr23 bool +validate(const char16_t *data, size_t len) noexcept { + uint64_t pos = 0; + while (pos < len) { + char16_t word = scalar::utf16::swap_if_needed(data[pos]); + if ((word & 0xF800) == 0xD800) { + if (pos + 1 >= len) { + return false; + } + char16_t diff = char16_t(word - 0xD800); + if (diff > 0x3FF) { + return false; + } + char16_t next_word = !match_system(big_endian) + ? u16_swap_bytes(data[pos + 1]) + : data[pos + 1]; + char16_t diff2 = char16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return false; + } + pos += 2; + } else { + pos++; + } + } + return true; +} + +template +inline simdutf_warn_unused simdutf_constexpr23 result +validate_with_errors(const char16_t *data, size_t len) noexcept { + size_t pos = 0; + while (pos < len) { + char16_t word = scalar::utf16::swap_if_needed(data[pos]); + if ((word & 0xF800) == 0xD800) { + if (pos + 1 >= len) { + return result(error_code::SURROGATE, pos); + } + char16_t diff = char16_t(word - 0xD800); + if (diff > 0x3FF) { + return result(error_code::SURROGATE, pos); + } + char16_t next_word = !match_system(big_endian) + ? u16_swap_bytes(data[pos + 1]) + : data[pos + 1]; + char16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return result(error_code::SURROGATE, pos); + } + pos += 2; + } else { + pos++; + } + } + return result(error_code::SUCCESS, pos); +} + +template +simdutf_constexpr23 size_t count_code_points(const char16_t *p, size_t len) { + // We are not BOM aware. + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + char16_t word = scalar::utf16::swap_if_needed(p[i]); + counter += ((word & 0xFC00) != 0xDC00); + } + return counter; +} + +template +simdutf_constexpr23 size_t utf8_length_from_utf16(const char16_t *p, + size_t len) { + // We are not BOM aware. + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + char16_t word = scalar::utf16::swap_if_needed(p[i]); + counter++; // ASCII + counter += static_cast( + word > + 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes + counter += static_cast((word > 0x7FF && word <= 0xD7FF) || + (word >= 0xE000)); // three-byte + } + return counter; +} + +template +simdutf_constexpr23 size_t utf32_length_from_utf16(const char16_t *p, + size_t len) { + // We are not BOM aware. 
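+ // Every code unit counts as one code point except low surrogates + // (0xDC00..0xDFFF), which are the trailing half of a surrogate pair.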
+ size_t counter{0}; + for (size_t i = 0; i < len; i++) { + char16_t word = scalar::utf16::swap_if_needed(p[i]); + counter += ((word & 0xFC00) != 0xDC00); + } + return counter; +} + +simdutf_really_inline simdutf_constexpr23 void +change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) { + for (size_t i = 0; i < size; i++) { + *output++ = char16_t(input[i] >> 8 | input[i] << 8); + } +} + +template +simdutf_warn_unused simdutf_constexpr23 size_t +trim_partial_utf16(const char16_t *input, size_t length) { + if (length == 0) { + return 0; + } + uint16_t last_word = uint16_t(input[length - 1]); + last_word = scalar::utf16::swap_if_needed(last_word); + length -= ((last_word & 0xFC00) == 0xD800); + return length; +} + +template +simdutf_constexpr bool is_high_surrogate(char16_t c) { + c = scalar::utf16::swap_if_needed(c); + return (0xd800 <= c && c <= 0xdbff); +} + +template +simdutf_constexpr bool is_low_surrogate(char16_t c) { + c = scalar::utf16::swap_if_needed(c); + return (0xdc00 <= c && c <= 0xdfff); +} + +simdutf_really_inline constexpr bool high_surrogate(char16_t c) { + return (0xd800 <= c && c <= 0xdbff); +} + +simdutf_really_inline constexpr bool low_surrogate(char16_t c) { + return (0xdc00 <= c && c <= 0xdfff); +} + +template +simdutf_constexpr23 result +utf8_length_from_utf16_with_replacement(const char16_t *p, size_t len) { + bool any_surrogates = false; + // We are not BOM aware. + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + if (is_high_surrogate(p[i])) { + any_surrogates = true; + // surrogate pair + if (i + 1 < len && is_low_surrogate(p[i + 1])) { + counter += 4; + i++; // skip low surrogate + } else { + counter += 3; // unpaired high surrogate replaced by U+FFFD + } + continue; + } else if (is_low_surrogate(p[i])) { + any_surrogates = true; + counter += 3; // unpaired low surrogate replaced by U+FFFD + continue; + } + char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i]; + counter++; // at least 1 byte + counter += + static_cast(word > 0x7F); // non-ASCII is at least 2 bytes + counter += static_cast(word > 0x7FF); // three-byte + } + return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS, + counter}; +} + +// variable templates are a C++14 extension +template constexpr char16_t replacement() { + return !match_system(big_endian) ? 
scalar::u16_swap_bytes(0xfffd) : 0xfffd; +} + +template +simdutf_constexpr23 void to_well_formed_utf16(const char16_t *input, size_t len, + char16_t *output) { + const char16_t replacement = utf16::replacement(); + bool high_surrogate_prev = false, high_surrogate, low_surrogate; + size_t i = 0; + for (; i < len; i++) { + char16_t c = input[i]; + high_surrogate = is_high_surrogate(c); + low_surrogate = is_low_surrogate(c); + if (high_surrogate_prev && !low_surrogate) { + output[i - 1] = replacement; + } + + if (!high_surrogate_prev && low_surrogate) { + output[i] = replacement; + } else { + output[i] = input[i]; + } + high_surrogate_prev = high_surrogate; + } + + /* string may not end with high surrogate */ + if (high_surrogate_prev) { + output[i - 1] = replacement; + } +} + +} // namespace utf16 +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf16.h */ +/* begin file include/simdutf/scalar/utf16_to_latin1/utf16_to_latin1.h */ +#ifndef SIMDUTF_UTF16_TO_LATIN1_H +#define SIMDUTF_UTF16_TO_LATIN1_H + +#include // for std::memcpy + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_latin1 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires(simdutf::detail::indexes_into_utf16 && + simdutf::detail::index_assignable_from_char) +#endif +simdutf_constexpr23 size_t convert(InputPtr data, size_t len, + OutputPtr latin_output) { + if (len == 0) { + return 0; + } + size_t pos = 0; + const auto latin_output_start = latin_output; + uint16_t word = 0; + uint16_t too_large = 0; + + while (pos < len) { + word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos]; + too_large |= word; + *latin_output++ = char(word & 0xFF); + pos++; + } + if ((too_large & 0xFF00) != 0) { + return 0; + } + + return latin_output - latin_output_start; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires(simdutf::detail::indexes_into_utf16 && + simdutf::detail::index_assignable_from_char) +#endif +simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len, + OutputPtr latin_output) { + if (len == 0) { + return result(error_code::SUCCESS, 0); + } + size_t pos = 0; + auto start = latin_output; + uint16_t word; + + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that + // they are Latin1 + uint64_t v1, v2, v3, v4; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + ::memcpy(&v2, data + pos + 4, sizeof(uint64_t)); + ::memcpy(&v3, data + pos + 8, sizeof(uint64_t)); + ::memcpy(&v4, data + pos + 12, sizeof(uint64_t)); + + if simdutf_constexpr (!match_system(big_endian)) { + v1 = (v1 >> 8) | (v1 << (64 - 8)); + } + if simdutf_constexpr (!match_system(big_endian)) { + v2 = (v2 >> 8) | (v2 << (64 - 8)); + } + if simdutf_constexpr (!match_system(big_endian)) { + v3 = (v3 >> 8) | (v3 << (64 - 8)); + } + if simdutf_constexpr (!match_system(big_endian)) { + v4 = (v4 >> 8) | (v4 << (64 - 8)); + } + + if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) { + size_t final_pos = pos + 16; + while (pos < final_pos) { + *latin_output++ = !match_system(big_endian) + ? char(u16_swap_bytes(data[pos])) + : char(data[pos]); + pos++; + } + continue; + } + } + } + + word = !match_system(big_endian) ? 
u16_swap_bytes(data[pos]) : data[pos]; + if ((word & 0xFF00) == 0) { + *latin_output++ = char(word & 0xFF); + pos++; + } else { + return result(error_code::TOO_LARGE, pos); + } + } + return result(error_code::SUCCESS, latin_output - start); +} + +} // namespace utf16_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf16_to_latin1/utf16_to_latin1.h */ +/* begin file include/simdutf/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */ +#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H +#define SIMDUTF_VALID_UTF16_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_latin1 { + +template +simdutf_constexpr23 inline size_t +convert_valid_impl(InputIterator data, size_t len, + OutputIterator latin_output) { + static_assert( + std::is_same::type, uint16_t>::value, + "must decay to uint16_t"); + size_t pos = 0; + const auto start = latin_output; + uint16_t word = 0; + + while (pos < len) { + word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos]; + *latin_output++ = char(word); + pos++; + } + + return latin_output - start; +} + +template +simdutf_really_inline size_t convert_valid(const char16_t *buf, size_t len, + char *latin_output) { + return convert_valid_impl(reinterpret_cast(buf), + len, latin_output); +} +} // namespace utf16_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */ +/* begin file include/simdutf/scalar/utf16_to_utf32/utf16_to_utf32.h */ +#ifndef SIMDUTF_UTF16_TO_UTF32_H +#define SIMDUTF_UTF16_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf32 { + +template +simdutf_constexpr23 size_t convert(const char16_t *data, size_t len, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { + uint16_t word = + !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos]; + if ((word & 0xF800) != 0xD800) { + // No surrogate pair, extend 16-bit word to 32-bit word + *utf32_output++ = char32_t(word); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if (diff > 0x3FF) { + return 0; + } + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + uint16_t next_word = !match_system(big_endian) + ? u16_swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return 0; + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + pos += 2; + } + } + return utf32_output - start; +} + +template +simdutf_constexpr23 result convert_with_errors(const char16_t *data, size_t len, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { + uint16_t word = + !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos]; + if ((word & 0xF800) != 0xD800) { + // No surrogate pair, extend 16-bit word to 32-bit word + *utf32_output++ = char32_t(word); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if (diff > 0x3FF) { + return result(error_code::SURROGATE, pos); + } + if (pos + 1 >= len) { + return result(error_code::SURROGATE, pos); + } // minimal bound checking + uint16_t next_word = !match_system(big_endian) + ? 
u16_swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return result(error_code::SURROGATE, pos); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + pos += 2; + } + } + return result(error_code::SUCCESS, utf32_output - start); +} + +} // namespace utf16_to_utf32 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf16_to_utf32/utf16_to_utf32.h */ +/* begin file include/simdutf/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ +#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H +#define SIMDUTF_VALID_UTF16_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf32 { + +template +simdutf_constexpr23 size_t convert_valid(const char16_t *data, size_t len, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { + uint16_t word = + !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos]; + if ((word & 0xF800) != 0xD800) { + // No surrogate pair, extend 16-bit word to 32-bit word + *utf32_output++ = char32_t(word); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + uint16_t next_word = !match_system(big_endian) + ? u16_swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + pos += 2; + } + } + return utf32_output - start; +} + +} // namespace utf16_to_utf32 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ +/* begin file include/simdutf/scalar/utf16_to_utf8/utf16_to_utf8.h */ +#ifndef SIMDUTF_UTF16_TO_UTF8_H +#define SIMDUTF_UTF16_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf8 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_utf16 +// FIXME constrain output as well +#endif +simdutf_constexpr23 size_t convert(InputPtr data, size_t len, + OutputPtr utf8_output) { + size_t pos = 0; + const auto start = utf8_output; + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + // try to convert the next block of 8 bytes + if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that + // they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if simdutf_constexpr (!match_system(big_endian)) { + v = (v >> 8) | (v << (64 - 8)); + } + if ((v & 0xFF80FF80FF80FF80) == 0) { + size_t final_pos = pos + 4; + while (pos < final_pos) { + *utf8_output++ = !match_system(big_endian) + ? char(u16_swap_bytes(data[pos])) + : char(data[pos]); + pos++; + } + continue; + } + } + } + uint16_t word = + !match_system(big_endian) ? 
u16_swap_bytes(data[pos]) : data[pos]; + if ((word & 0xFF80) == 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xF800) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // must be a surrogate pair + if (pos + 1 >= len) { + return 0; + } + uint16_t diff = uint16_t(word - 0xD800); + if (diff > 0x3FF) { + return 0; + } + uint16_t next_word = !match_system(big_endian) + ? u16_swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return 0; + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return utf8_output - start; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires(simdutf::detail::indexes_into_utf16 && + simdutf::detail::index_assignable_from_char) +#endif +simdutf_constexpr23 full_result convert_with_errors(InputPtr data, size_t len, + OutputPtr utf8_output, + size_t utf8_len = 0) { + if (check_output && utf8_len == 0) { + return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, 0, 0); + } + + size_t pos = 0; + auto start = utf8_output; + auto end = utf8_output + utf8_len; + + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + // try to convert the next block of 8 bytes + if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that + // they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if simdutf_constexpr (!match_system(big_endian)) + v = (v >> 8) | (v << (64 - 8)); + if ((v & 0xFF80FF80FF80FF80) == 0) { + size_t final_pos = pos + 4; + while (pos < final_pos) { + if (check_output && size_t(end - utf8_output) < 1) { + return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos, + utf8_output - start); + } + *utf8_output++ = !match_system(big_endian) + ? char(u16_swap_bytes(data[pos])) + : char(data[pos]); + pos++; + } + continue; + } + } + } + + uint16_t word = + !match_system(big_endian) ? 
u16_swap_bytes(data[pos]) : data[pos]; + if ((word & 0xFF80) == 0) { + // will generate one UTF-8 bytes + if (check_output && size_t(end - utf8_output) < 1) { + return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos, + utf8_output - start); + } + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + if (check_output && size_t(end - utf8_output) < 2) { + return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos, + utf8_output - start); + } + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + + } else if ((word & 0xF800) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + if (check_output && size_t(end - utf8_output) < 3) { + return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos, + utf8_output - start); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + + if (check_output && size_t(end - utf8_output) < 4) { + return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos, + utf8_output - start); + } + // must be a surrogate pair + if (pos + 1 >= len) { + return full_result(error_code::SURROGATE, pos, utf8_output - start); + } + uint16_t diff = uint16_t(word - 0xD800); + if (diff > 0x3FF) { + return full_result(error_code::SURROGATE, pos, utf8_output - start); + } + uint16_t next_word = !match_system(big_endian) + ? u16_swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return full_result(error_code::SURROGATE, pos, utf8_output - start); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return full_result(error_code::SUCCESS, pos, utf8_output - start); +} + +template +inline result simple_convert_with_errors(const char16_t *buf, size_t len, + char *utf8_output) { + return convert_with_errors(buf, len, utf8_output, 0); +} + +} // namespace utf16_to_utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf16_to_utf8/utf16_to_utf8.h */ +/* begin file include/simdutf/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ +#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H +#define SIMDUTF_VALID_UTF16_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf8 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires(simdutf::detail::indexes_into_utf16 && + simdutf::detail::index_assignable_from_char) +#endif +simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len, + OutputPtr utf8_output) { + size_t pos = 0; + auto start = utf8_output; + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + // try to convert the next block of 4 ASCII characters + if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that + // they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if simdutf_constexpr (!match_system(big_endian)) { + v = (v >> 8) | (v << (64 - 8)); + } + if ((v & 0xFF80FF80FF80FF80) == 
0) { + size_t final_pos = pos + 4; + while (pos < final_pos) { + *utf8_output++ = !match_system(big_endian) + ? char(u16_swap_bytes(data[pos])) + : char(data[pos]); + pos++; + } + continue; + } + } + } + + uint16_t word = + !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos]; + if ((word & 0xFF80) == 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xF800) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + uint16_t next_word = !match_system(big_endian) + ? u16_swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return utf8_output - start; +} + +} // namespace utf16_to_utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ +/* begin file include/simdutf/scalar/utf32.h */ +#ifndef SIMDUTF_UTF32_H +#define SIMDUTF_UTF32_H + +namespace simdutf { +namespace scalar { +namespace utf32 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_uint32 +#endif +simdutf_warn_unused simdutf_constexpr23 bool validate(InputPtr data, + size_t len) noexcept { + uint64_t pos = 0; + for (; pos < len; pos++) { + uint32_t word = data[pos]; + if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) { + return false; + } + } + return true; +} + +simdutf_warn_unused simdutf_really_inline bool validate(const char32_t *buf, + size_t len) noexcept { + return validate(reinterpret_cast(buf), len); +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_uint32 +#endif +simdutf_warn_unused simdutf_constexpr23 result +validate_with_errors(InputPtr data, size_t len) noexcept { + size_t pos = 0; + for (; pos < len; pos++) { + uint32_t word = data[pos]; + if (word > 0x10FFFF) { + return result(error_code::TOO_LARGE, pos); + } + if (word >= 0xD800 && word <= 0xDFFF) { + return result(error_code::SURROGATE, pos); + } + } + return result(error_code::SUCCESS, pos); +} + +simdutf_warn_unused simdutf_really_inline result +validate_with_errors(const char32_t *buf, size_t len) noexcept { + return validate_with_errors(reinterpret_cast(buf), len); +} + +inline simdutf_constexpr23 size_t utf8_length_from_utf32(const char32_t *p, + size_t len) { + // We are not BOM aware. 
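+  // For example, U+0041 counts as 1 byte, U+00E9 as 1 + 1 = 2 bytes,
+  // U+20AC as 1 + 1 + 1 = 3 bytes and U+1F600 as 1 + 1 + 1 + 1 = 4 bytes,
+  // matching the lengths of their UTF-8 encodings.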
+ size_t counter{0}; + for (size_t i = 0; i < len; i++) { + // credit: @ttsugriy for the vectorizable approach + counter++; // ASCII + counter += static_cast(p[i] > 0x7F); // two-byte + counter += static_cast(p[i] > 0x7FF); // three-byte + counter += static_cast(p[i] > 0xFFFF); // four-bytes + } + return counter; +} + +inline simdutf_warn_unused simdutf_constexpr23 size_t +utf16_length_from_utf32(const char32_t *p, size_t len) { + // We are not BOM aware. + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + counter++; // non-surrogate word + counter += static_cast(p[i] > 0xFFFF); // surrogate pair + } + return counter; +} + +} // namespace utf32 +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf32.h */ +/* begin file include/simdutf/scalar/utf32_to_latin1/utf32_to_latin1.h */ +#ifndef SIMDUTF_UTF32_TO_LATIN1_H +#define SIMDUTF_UTF32_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_latin1 { + +inline simdutf_constexpr23 size_t convert(const char32_t *data, size_t len, + char *latin1_output) { + char *start = latin1_output; + uint32_t utf32_char; + size_t pos = 0; + uint32_t too_large = 0; + + while (pos < len) { + utf32_char = (uint32_t)data[pos]; + too_large |= utf32_char; + *latin1_output++ = (char)(utf32_char & 0xFF); + pos++; + } + if ((too_large & 0xFFFFFF00) != 0) { + return 0; + } + return latin1_output - start; +} + +inline simdutf_constexpr23 result convert_with_errors(const char32_t *data, + size_t len, + char *latin1_output) { + char *start{latin1_output}; + size_t pos = 0; + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that + // they are Latin1 + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF00FFFFFF00) == 0) { + *latin1_output++ = char(data[pos]); + *latin1_output++ = char(data[pos + 1]); + pos += 2; + continue; + } + } + } + + uint32_t utf32_char = data[pos]; + if ((utf32_char & 0xFFFFFF00) == + 0) { // Check if the character can be represented in Latin-1 + *latin1_output++ = (char)(utf32_char & 0xFF); + pos++; + } else { + return result(error_code::TOO_LARGE, pos); + }; + } + return result(error_code::SUCCESS, latin1_output - start); +} + +} // namespace utf32_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf32_to_latin1/utf32_to_latin1.h */ +/* begin file include/simdutf/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */ +#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H +#define SIMDUTF_VALID_UTF32_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_latin1 { + +template +simdutf_constexpr23 size_t convert_valid(ReadPtr data, size_t len, + WritePtr latin1_output) { + static_assert( + std::is_same::type, uint32_t>::value, + "dereferencing the data pointer must result in a uint32_t"); + auto start = latin1_output; + uint32_t utf32_char; + size_t pos = 0; + + while (pos < len) { + utf32_char = data[pos]; + +#if SIMDUTF_CPLUSPLUS23 + // avoid using the 8 byte at a time optimization in constant evaluation + // mode. memcpy can't be used and replacing it with bitwise or gave worse + // codegen (when not during constant evaluation). 
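+  // The mask 0xFFFFFF00FFFFFF00 used below covers the upper 24 bits of each
+  // of the two packed 32-bit code points: e.g. 0x00000041 ('A') and
+  // 0x000000E9 both leave the mask bits at zero, so both fit in Latin1.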
+ if !consteval { +#endif + if (pos + 2 <= len) { + // if it is safe to read 8 more bytes, check that they are Latin1 + uint64_t v; + std::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF00FFFFFF00) == 0) { + *latin1_output++ = char(data[pos]); + *latin1_output++ = char(data[pos + 1]); + pos += 2; + continue; + } else { + // output can not be represented in latin1 + return 0; + } + } +#if SIMDUTF_CPLUSPLUS23 + } // if ! consteval +#endif + if ((utf32_char & 0xFFFFFF00) == 0) { + *latin1_output++ = char(utf32_char); + } else { + // output can not be represented in latin1 + return 0; + } + pos++; + } + return latin1_output - start; +} + +simdutf_really_inline size_t convert_valid(const char32_t *buf, size_t len, + char *latin1_output) { + return convert_valid(reinterpret_cast(buf), len, + latin1_output); +} + +} // namespace utf32_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */ +/* begin file include/simdutf/scalar/utf32_to_utf16/utf32_to_utf16.h */ +#ifndef SIMDUTF_UTF32_TO_UTF16_H +#define SIMDUTF_UTF32_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf16 { + +template +simdutf_constexpr23 size_t convert(const char32_t *data, size_t len, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { + uint32_t word = data[pos]; + if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return 0; + } + // will not generate a surrogate pair + *utf16_output++ = !match_system(big_endian) + ? char16_t(u16_swap_bytes(uint16_t(word))) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return 0; + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = u16_swap_bytes(high_surrogate); + low_surrogate = u16_swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + pos++; + } + return utf16_output - start; +} + +template +simdutf_constexpr23 result convert_with_errors(const char32_t *data, size_t len, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { + uint32_t word = data[pos]; + if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return result(error_code::SURROGATE, pos); + } + // will not generate a surrogate pair + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(u16_swap_bytes(uint16_t(word))) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return result(error_code::TOO_LARGE, pos); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = u16_swap_bytes(high_surrogate); + low_surrogate = u16_swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + pos++; + } + return result(error_code::SUCCESS, utf16_output - start); +} + +} // namespace utf32_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf32_to_utf16/utf32_to_utf16.h */ +/* begin file include/simdutf/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ +#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H +#define SIMDUTF_VALID_UTF32_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf16 { + +template +simdutf_constexpr23 size_t convert_valid(const char32_t *data, size_t len, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { + uint32_t word = data[pos]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + *utf16_output++ = !match_system(big_endian) + ? char16_t(u16_swap_bytes(uint16_t(word))) + : char16_t(word); + pos++; + } else { + // will generate a surrogate pair + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = u16_swap_bytes(high_surrogate); + low_surrogate = u16_swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos++; + } + } + return utf16_output - start; +} + +} // namespace utf32_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ +/* begin file include/simdutf/scalar/utf32_to_utf8/utf32_to_utf8.h */ +#ifndef SIMDUTF_UTF32_TO_UTF8_H +#define SIMDUTF_UTF32_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf8 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires(simdutf::detail::indexes_into_utf32 && + simdutf::detail::index_assignable_from_char) +#endif +simdutf_constexpr23 size_t convert(InputPtr data, size_t len, + OutputPtr utf8_output) { + size_t pos = 0; + auto start = utf8_output; + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { // try to convert the next block of 2 ASCII characters + if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that + // they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF80FFFFFF80) == 0) { + *utf8_output++ = char(data[pos]); + *utf8_output++ = char(data[pos + 1]); + pos += 2; + continue; + } + } + } + + uint32_t word = data[pos]; + if ((word & 0xFFFFFF80) == 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xFFFFF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xFFFF0000) == 0) { + // will generate three UTF-8 bytes + // 
we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + if (word >= 0xD800 && word <= 0xDFFF) { + return 0; + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + if (word > 0x10FFFF) { + return 0; + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } + } + return utf8_output - start; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires(simdutf::detail::indexes_into_utf32 && + simdutf::detail::index_assignable_from_char) +#endif +simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len, + OutputPtr utf8_output) { + size_t pos = 0; + auto start = utf8_output; + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { // try to convert the next block of 2 ASCII characters + if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that + // they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF80FFFFFF80) == 0) { + *utf8_output++ = char(data[pos]); + *utf8_output++ = char(data[pos + 1]); + pos += 2; + continue; + } + } + } + + uint32_t word = data[pos]; + if ((word & 0xFFFFFF80) == 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xFFFFF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xFFFF0000) == 0) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + if (word >= 0xD800 && word <= 0xDFFF) { + return result(error_code::SURROGATE, pos); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + if (word > 0x10FFFF) { + return result(error_code::TOO_LARGE, pos); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } + } + return result(error_code::SUCCESS, utf8_output - start); +} + +} // namespace utf32_to_utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf32_to_utf8/utf32_to_utf8.h */ +/* begin file include/simdutf/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */ +#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H +#define SIMDUTF_VALID_UTF32_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf8 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires(simdutf::detail::indexes_into_utf32 && + simdutf::detail::index_assignable_from_char) +#endif +simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len, + OutputPtr utf8_output) { + size_t pos = 0; + auto start = utf8_output; + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { // try to convert the next block of 2 ASCII characters + if (pos + 2 <= len) 
{ // if it is safe to read 8 more bytes, check that + // they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF80FFFFFF80) == 0) { + *utf8_output++ = char(data[pos]); + *utf8_output++ = char(data[pos + 1]); + pos += 2; + continue; + } + } + } + + uint32_t word = data[pos]; + if ((word & 0xFFFFFF80) == 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xFFFFF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xFFFF0000) == 0) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } + } + return utf8_output - start; +} + +} // namespace utf32_to_utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */ +/* begin file include/simdutf/scalar/utf8.h */ +#ifndef SIMDUTF_UTF8_H +#define SIMDUTF_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8 { + +// credit: based on code from Google Fuchsia (Apache Licensed) +template +simdutf_constexpr23 simdutf_warn_unused bool validate(BytePtr data, + size_t len) noexcept { + static_assert( + std::is_same::type, uint8_t>::value, + "dereferencing the data pointer must result in a uint8_t"); + uint64_t pos = 0; + uint32_t code_point = 0; + while (pos < len) { + uint64_t next_pos; +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { // check if the next 16 bytes are ascii. 
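+      // The mask 0x8080808080808080 selects the high bit of each of the 8
+      // bytes in a word; ASCII bytes are below 0x80, so (v1 | v2) masked
+      // this way is zero exactly when all 16 bytes are ASCII.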
+      next_pos = pos + 16;
+      if (next_pos <= len) { // if it is safe to read 16 more bytes, check
+                             // that they are ascii
+        uint64_t v1{};
+        std::memcpy(&v1, data + pos, sizeof(uint64_t));
+        uint64_t v2{};
+        std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+        uint64_t v{v1 | v2};
+        if ((v & 0x8080808080808080) == 0) {
+          pos = next_pos;
+          continue;
+        }
+      }
+    }
+
+    unsigned char byte = data[pos];
+
+    while (byte < 0b10000000) {
+      if (++pos == len) {
+        return true;
+      }
+      byte = data[pos];
+    }
+
+    if ((byte & 0b11100000) == 0b11000000) {
+      next_pos = pos + 2;
+      if (next_pos > len) {
+        return false;
+      }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      // range check
+      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if ((code_point < 0x80) || (0x7ff < code_point)) {
+        return false;
+      }
+    } else if ((byte & 0b11110000) == 0b11100000) {
+      next_pos = pos + 3;
+      if (next_pos > len) {
+        return false;
+      }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      // range check
+      code_point = (byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if ((code_point < 0x800) || (0xffff < code_point) ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return false;
+      }
+    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+      next_pos = pos + 4;
+      if (next_pos > len) {
+        return false;
+      }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      // range check
+      code_point =
+          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) {
+        return false;
+      }
+    } else {
+      // we may have a continuation
+      return false;
+    }
+    pos = next_pos;
+  }
+  return true;
+}
+
+simdutf_really_inline simdutf_warn_unused bool validate(const char *buf,
+                                                        size_t len) noexcept {
+  return validate(reinterpret_cast<const uint8_t *>(buf), len);
+}
+
+template <typename BytePtr>
+simdutf_constexpr23 simdutf_warn_unused result
+validate_with_errors(BytePtr data, size_t len) noexcept {
+  static_assert(
+      std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
+      "dereferencing the data pointer must result in a uint8_t");
+  size_t pos = 0;
+  uint32_t code_point = 0;
+  while (pos < len) {
+    // check if the next 16 bytes are ascii.
+ size_t next_pos = pos + 16; + if (next_pos <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + pos = next_pos; + continue; + } + } + unsigned char byte = data[pos]; + + while (byte < 0b10000000) { + if (++pos == len) { + return result(error_code::SUCCESS, len); + } + byte = data[pos]; + } + + if ((byte & 0b11100000) == 0b11000000) { + next_pos = pos + 2; + if (next_pos > len) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if ((code_point < 0x80) || (0x7ff < code_point)) { + return result(error_code::OVERLONG, pos); + } + } else if ((byte & 0b11110000) == 0b11100000) { + next_pos = pos + 3; + if (next_pos > len) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + code_point = (byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if ((code_point < 0x800) || (0xffff < code_point)) { + return result(error_code::OVERLONG, pos); + } + if (0xd7ff < code_point && code_point < 0xe000) { + return result(error_code::SURROGATE, pos); + } + } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 + next_pos = pos + 4; + if (next_pos > len) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + code_point = + (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff) { + return result(error_code::OVERLONG, pos); + } + if (0x10ffff < code_point) { + return result(error_code::TOO_LARGE, pos); + } + } else { + // we either have too many continuation bytes or an invalid leading byte + if ((byte & 0b11000000) == 0b10000000) { + return result(error_code::TOO_LONG, pos); + } else { + return result(error_code::HEADER_BITS, pos); + } + } + pos = next_pos; + } + return result(error_code::SUCCESS, len); +} + +simdutf_really_inline simdutf_warn_unused result +validate_with_errors(const char *buf, size_t len) noexcept { + return validate_with_errors(reinterpret_cast(buf), len); +} + +// Finds the previous leading byte starting backward from buf and validates with +// errors from there Used to pinpoint the location of an error when an invalid +// chunk is detected We assume that the stream starts with a leading byte, and +// to check that it is the case, we ask that you pass a pointer to the start of +// the stream (start). 
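+// For example, if buf points at the last byte of U+20AC (0xE2 0x82 0xAC),
+// the loop below steps back over the continuation bytes 0xAC and 0x82 to the
+// leading byte 0xE2 before validating.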
+inline simdutf_warn_unused result rewind_and_validate_with_errors( + const char *start, const char *buf, size_t len) noexcept { + // First check that we start with a leading byte + if ((*start & 0b11000000) == 0b10000000) { + return result(error_code::TOO_LONG, 0); + } + size_t extra_len{0}; + // A leading byte cannot be further than 4 bytes away + for (int i = 0; i < 5; i++) { + unsigned char byte = *buf; + if ((byte & 0b11000000) != 0b10000000) { + break; + } else { + buf--; + extra_len++; + } + } + + result res = validate_with_errors(buf, len + extra_len); + res.count -= extra_len; + return res; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 size_t count_code_points(InputPtr data, size_t len) { + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + // -65 is 0b10111111, anything larger in two-complement's should start a new + // code point. + if (int8_t(data[i]) > -65) { + counter++; + } + } + return counter; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 size_t utf16_length_from_utf8(InputPtr data, size_t len) { + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + if (int8_t(data[i]) > -65) { + counter++; + } + if (uint8_t(data[i]) >= 240) { + counter++; + } + } + return counter; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_warn_unused simdutf_constexpr23 size_t +trim_partial_utf8(InputPtr input, size_t length) { + if (length < 3) { + switch (length) { + case 2: + if (uint8_t(input[length - 1]) >= 0xc0) { + return length - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (uint8_t(input[length - 2]) >= 0xe0) { + return length - 2; + } // 3- and 4-byte characters with only 2 bytes left + return length; + case 1: + if (uint8_t(input[length - 1]) >= 0xc0) { + return length - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + return length; + case 0: + return length; + } + } + if (uint8_t(input[length - 1]) >= 0xc0) { + return length - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (uint8_t(input[length - 2]) >= 0xe0) { + return length - 2; + } // 3- and 4-byte characters with only 1 byte left + if (uint8_t(input[length - 3]) >= 0xf0) { + return length - 3; + } // 4-byte characters with only 3 bytes left + return length; +} + +} // namespace utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf8.h */ +/* begin file include/simdutf/scalar/utf8_to_latin1/utf8_to_latin1.h */ +#ifndef SIMDUTF_UTF8_TO_LATIN1_H +#define SIMDUTF_UTF8_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_latin1 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires(simdutf::detail::indexes_into_byte_like && + simdutf::detail::indexes_into_byte_like) +#endif +simdutf_constexpr23 size_t convert(InputPtr data, size_t len, + OutputPtr latin_output) { + size_t pos = 0; + auto start = latin_output; + + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that + // they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 + // 1000 1000 .... 
etc + if ((v & 0x8080808080808080) == + 0) { // if NONE of these are set, e.g. all of them are zero, then + // everything is ASCII + size_t final_pos = pos + 16; + while (pos < final_pos) { + *latin_output++ = char(data[pos]); + pos++; + } + continue; + } + } + } + + // suppose it is not an all ASCII byte sequence + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *latin_output++ = char(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == + 0b11000000) { // the first three bits indicate: + // We have a two-byte UTF-8 + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } // checks if the next byte is a valid continuation byte in UTF-8. A + // valid continuation byte starts with 10. + // range check - + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | + (data[pos + 1] & + 0b00111111); // assembles the Unicode code point from the two bytes. + // It does this by discarding the leading 110 and 10 + // bits from the two bytes, shifting the remaining bits + // of the first byte, and then combining the results + // with a bitwise OR operation. + if (code_point < 0x80 || 0xFF < code_point) { + return 0; // We only care about the range 129-255 which is Non-ASCII + // latin1 characters. A code_point beneath 0x80 is invalid as + // it is already covered by bytes whose leading bit is zero. + } + *latin_output++ = char(code_point); + pos += 2; + } else { + return 0; + } + } + return latin_output - start; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len, + char *latin_output) { + size_t pos = 0; + char *start{latin_output}; + + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that + // they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 + // 1000 1000...etc + if ((v & 0x8080808080808080) == + 0) { // if NONE of these are set, e.g. all of them are zero, then + // everything is ASCII + size_t final_pos = pos + 16; + while (pos < final_pos) { + *latin_output++ = char(data[pos]); + pos++; + } + continue; + } + } + } + // suppose it is not an all ASCII byte sequence + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *latin_output++ = char(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == + 0b11000000) { // the first three bits indicate: + // We have a two-byte UTF-8 + if (pos + 1 >= len) { + return result(error_code::TOO_SHORT, pos); + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } // checks if the next byte is a valid continuation byte in UTF-8. A + // valid continuation byte starts with 10. + // range check - + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | + (data[pos + 1] & + 0b00111111); // assembles the Unicode code point from the two bytes. 
+                       // It does this by discarding the leading 110 and 10
+                       // bits from the two bytes, shifting the remaining bits
+                       // of the first byte, and then combining the results
+                       // with a bitwise OR operation.
+      if (code_point < 0x80) {
+        return result(error_code::OVERLONG, pos);
+      }
+      if (0xFF < code_point) {
+        return result(error_code::TOO_LARGE, pos);
+      } // We only care about the range 128-255 which is Non-ASCII latin1
+        // characters
+      *latin_output++ = char(code_point);
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8
+      return result(error_code::TOO_LARGE, pos);
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word.
+      return result(error_code::TOO_LARGE, pos);
+    } else {
+      // we either have too many continuation bytes or an invalid leading byte
+      if ((leading_byte & 0b11000000) == 0b10000000) {
+        return result(error_code::TOO_LONG, pos);
+      }
+
+      return result(error_code::HEADER_BITS, pos);
+    }
+  }
+  return result(error_code::SUCCESS, latin_output - start);
+}
+
+inline result rewind_and_convert_with_errors(size_t prior_bytes,
+                                             const char *buf, size_t len,
+                                             char *latin1_output) {
+  size_t extra_len{0};
+  // We potentially need to go back in time and find a leading byte.
+  // In theory '3' would be sufficient, but sometimes the error can go back
+  // quite far.
+  size_t how_far_back = prior_bytes;
+  // size_t how_far_back = 3; // 3 bytes in the past + current position
+  // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
+  bool found_leading_bytes{false};
+  // important: it is i <= how_far_back and not 'i < how_far_back'.
+  for (size_t i = 0; i <= how_far_back; i++) {
+    unsigned char byte = buf[-static_cast<ptrdiff_t>(i)];
+    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+    if (found_leading_bytes) {
+      if (i > 0 && byte < 128) {
+        // If we had to go back and the leading byte is ascii
+        // then we can stop right away.
+        return result(error_code::TOO_LONG, 0 - i + 1);
+      }
+      buf -= i;
+      extra_len = i;
+      break;
+    }
+  }
+  //
+  // It is possible for this function to return a negative count in its result.
+  // C++ Standard Section 18.1 defines size_t in <cstddef>, which is described
+  // in the C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t
+  // as an unsigned integral type of the result of the sizeof operator.
+  //
+  // An unsigned type will simply wrap round arithmetically (well defined).
+  //
+  if (!found_leading_bytes) {
+    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
+    // [....] [continuation] [continuation] [continuation] | [buf is
+    // continuation] Or we possibly have a stream that does not start with a
+    // leading byte.
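+    // For example, 0 - how_far_back with how_far_back == 3 wraps around to
+    // SIZE_MAX - 2, which callers can interpret as three positions before
+    // 'buf'.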
+ return result(error_code::TOO_LONG, 0 - how_far_back); + } + result res = convert_with_errors(buf, len + extra_len, latin1_output); + if (res.error) { + res.count -= extra_len; + } + return res; +} + +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file include/simdutf/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */ +#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H +#define SIMDUTF_VALID_UTF8_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_latin1 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len, + char *latin_output) { + + size_t pos = 0; + char *start{latin_output}; + + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that + // they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | + v2}; // We are only interested in these bits: 1000 1000 1000 + // 1000, so it makes sense to concatenate everything + if ((v & 0x8080808080808080) == + 0) { // if NONE of these are set, e.g. all of them are zero, then + // everything is ASCII + size_t final_pos = pos + 16; + while (pos < final_pos) { + *latin_output++ = uint8_t(data[pos]); + pos++; + } + continue; + } + } + } + + // suppose it is not an all ASCII byte sequence + auto leading_byte = uint8_t(data[pos]); // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *latin_output++ = char(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == + 0b11000000) { // the first three bits indicate: + // We have a two-byte UTF-8 + if (pos + 1 >= len) { + break; + } // minimal bound checking + if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) { + return 0; + } // checks if the next byte is a valid continuation byte in UTF-8. A + // valid continuation byte starts with 10. + // range check - + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | + (uint8_t(data[pos + 1]) & + 0b00111111); // assembles the Unicode code point from the two bytes. + // It does this by discarding the leading 110 and 10 + // bits from the two bytes, shifting the remaining bits + // of the first byte, and then combining the results + // with a bitwise OR operation. 
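+      // For example, 0xC3 0xA9 assembles to (0xC3 & 0b00011111) << 6 = 0xC0
+      // combined with (0xA9 & 0b00111111) = 0x29, i.e. 0xE9 (U+00E9).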
+ *latin_output++ = char(code_point); + pos += 2; + } else { + // we may have a continuation but we do not do error checking + return 0; + } + } + return latin_output - start; +} + +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */ +/* begin file include/simdutf/scalar/utf8_to_utf16/utf8_to_utf16.h */ +#ifndef SIMDUTF_UTF8_TO_UTF16_H +#define SIMDUTF_UTF8_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf16 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 size_t convert(InputPtr data, size_t len, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + // try to convert the next block of 16 ASCII bytes + { + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that + // they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while (pos < final_pos) { + *utf16_output++ = !match_system(big_endian) + ? char16_t(u16_swap_bytes(data[pos])) + : char16_t(data[pos]); + pos++; + } + continue; + } + } + } + + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = !match_system(big_endian) + ? char16_t(u16_swap_bytes(leading_byte)) + : char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } + // range check + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { + return 0; + } + if simdutf_constexpr (!match_system(big_endian)) { + code_point = uint32_t(u16_swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. + if (pos + 2 >= len) { + return 0; + } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return 0; + } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return 0; + } + if simdutf_constexpr (!match_system(big_endian)) { + code_point = uint32_t(u16_swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. 
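+      // For example, 0xF0 0x9F 0x98 0x80 decodes to U+1F600; after the
+      // subtraction of 0x10000 below, 0xF600 splits into the surrogate pair
+      // 0xD83D 0xDE00.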
+ if (pos + 3 >= len) { + return 0; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return 0; + } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { + return 0; + } + + // range check + uint32_t code_point = (leading_byte & 0b00000111) << 18 | + (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | + (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { + return 0; + } + code_point -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = u16_swap_bytes(high_surrogate); + low_surrogate = u16_swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos += 4; + } else { + return 0; + } + } + return utf16_output - start; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that + // they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while (pos < final_pos) { + const char16_t byte = uint8_t(data[pos]); + *utf16_output++ = + !match_system(big_endian) ? u16_swap_bytes(byte) : byte; + pos++; + } + continue; + } + } + } + + auto leading_byte = uint8_t(data[pos]); // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = !match_system(big_endian) + ? char16_t(u16_swap_bytes(leading_byte)) + : char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. + if (pos + 1 >= len) { + return result(error_code::TOO_SHORT, pos); + } // minimal bound checking + if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + uint32_t code_point = (leading_byte & 0b00011111) << 6 | + (uint8_t(data[pos + 1]) & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { + return result(error_code::OVERLONG, pos); + } + if simdutf_constexpr (!match_system(big_endian)) { + code_point = uint32_t(u16_swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. 
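+      // For example, 0xE2 0x82 0xAC assembles to
+      // (0x2 << 12) | (0x02 << 6) | 0x2C = 0x20AC (the euro sign).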
+      if (pos + 2 >= len) {
+        return result(error_code::TOO_SHORT, pos);
+      } // minimal bound checking
+
+      if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      // range check
+      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+                            (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
+                            (uint8_t(data[pos + 2]) & 0b00111111);
+      if ((code_point < 0x800) || (0xffff < code_point)) {
+        return result(error_code::OVERLONG, pos);
+      }
+      if (0xd7ff < code_point && code_point < 0xe000) {
+        return result(error_code::SURROGATE, pos);
+      }
+      if simdutf_constexpr (!match_system(big_endian)) {
+        code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
+      }
+      *utf16_output++ = char16_t(code_point);
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word.
+      if (pos + 3 >= len) {
+        return result(error_code::TOO_SHORT, pos);
+      } // minimal bound checking
+      if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+
+      // range check
+      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
+                            (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
+                            (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
+                            (uint8_t(data[pos + 3]) & 0b00111111);
+      if (code_point <= 0xffff) {
+        return result(error_code::OVERLONG, pos);
+      }
+      if (0x10ffff < code_point) {
+        return result(error_code::TOO_LARGE, pos);
+      }
+      code_point -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+      if simdutf_constexpr (!match_system(big_endian)) {
+        high_surrogate = u16_swap_bytes(high_surrogate);
+        low_surrogate = u16_swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+      pos += 4;
+    } else {
+      // we either have too many continuation bytes or an invalid leading byte
+      if ((leading_byte & 0b11000000) == 0b10000000) {
+        return result(error_code::TOO_LONG, pos);
+      } else {
+        return result(error_code::HEADER_BITS, pos);
+      }
+    }
+  }
+  return result(error_code::SUCCESS, utf16_output - start);
+}
+
+/**
+ * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
+ * we have up to len input bytes left, and we encountered some error. It is
+ * possible that the error is at 'buf' exactly, but it could also be in the
+ * previous bytes (up to 3 bytes back).
+ *
+ * prior_bytes indicates how many bytes, prior to 'buf', may belong to the
+ * current memory section and can be safely accessed. We use prior_bytes to
+ * access safely up to three bytes before 'buf'.
+ *
+ * The caller is responsible for ensuring that len > 0.
+ *
+ * If the error is believed to have occurred prior to 'buf', the count value
+ * contained in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
+ */
+template <endianness big_endian>
+inline result rewind_and_convert_with_errors(size_t prior_bytes,
+                                             const char *buf, size_t len,
+                                             char16_t *utf16_output) {
+  size_t extra_len{0};
+  // We potentially need to go back in time and find a leading byte.
+  // In theory '3' would be sufficient, but sometimes the error can go back
+  // quite far.
+  size_t how_far_back = prior_bytes;
+  // size_t how_far_back = 3; // 3 bytes in the past + current position
+  // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
+  bool found_leading_bytes{false};
+  // important: it is i <= how_far_back and not 'i < how_far_back'.
+  for (size_t i = 0; i <= how_far_back; i++) {
+    unsigned char byte = buf[-static_cast<ptrdiff_t>(i)];
+    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+    if (found_leading_bytes) {
+      if (i > 0 && byte < 128) {
+        // If we had to go back and the leading byte is ascii
+        // then we can stop right away.
+        return result(error_code::TOO_LONG, 0 - i + 1);
+      }
+      buf -= i;
+      extra_len = i;
+      break;
+    }
+  }
+  //
+  // It is possible for this function to return a negative count in its result.
+  // C++ Standard Section 18.1 defines size_t in <cstddef>, which is described
+  // in the C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t
+  // as an unsigned integral type of the result of the sizeof operator.
+  //
+  // An unsigned type will simply wrap round arithmetically (well defined).
+  //
+  if (!found_leading_bytes) {
+    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
+    // [....] [continuation] [continuation] [continuation] | [buf is
+    // continuation] Or we possibly have a stream that does not start with a
+    // leading byte.
+    return result(error_code::TOO_LONG, 0 - how_far_back);
+  }
+  result res =
+      convert_with_errors<big_endian>(buf, len + extra_len, utf16_output);
+  if (res.error) {
+    res.count -= extra_len;
+  }
+  return res;
+}
+
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file include/simdutf/scalar/utf8_to_utf16/utf8_to_utf16.h */
+/* begin file include/simdutf/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
+#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
+#define SIMDUTF_VALID_UTF8_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf16 {
+
+template <endianness big_endian, typename InputPtr>
+#if SIMDUTF_CPLUSPLUS20
+  requires simdutf::detail::indexes_into_byte_like<InputPtr>
+#endif
+simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
+                                         char16_t *utf16_output) {
+  size_t pos = 0;
+  char16_t *start{utf16_output};
+  while (pos < len) {
+#if SIMDUTF_CPLUSPLUS23
+    if !consteval
+#endif
+    { // try to convert the next block of 8 ASCII bytes
+      if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that
+                            // they are ascii
+        uint64_t v;
+        ::memcpy(&v, data + pos, sizeof(uint64_t));
+        if ((v & 0x8080808080808080) == 0) {
+          size_t final_pos = pos + 8;
+          while (pos < final_pos) {
+            const char16_t byte = uint8_t(data[pos]);
+            *utf16_output++ =
+                !match_system(big_endian) ? u16_swap_bytes(byte) : byte;
+            pos++;
+          }
+          continue;
+        }
+      }
+    }
+
+    auto leading_byte = uint8_t(data[pos]); // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *utf16_output++ = !match_system(big_endian)
+                            ? char16_t(u16_swap_bytes(leading_byte))
+                            : char16_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8, it should become
+      // a single UTF-16 word.
+ if (pos + 1 >= len) { + break; + } // minimal bound checking + uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) | + (uint8_t(data[pos + 1]) & 0b00111111)); + if simdutf_constexpr (!match_system(big_endian)) { + code_point = u16_swap_bytes(uint16_t(code_point)); + } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. + if (pos + 2 >= len) { + break; + } // minimal bound checking + uint16_t code_point = + uint16_t(((leading_byte & 0b00001111) << 12) | + ((uint8_t(data[pos + 1]) & 0b00111111) << 6) | + (uint8_t(data[pos + 2]) & 0b00111111)); + if simdutf_constexpr (!match_system(big_endian)) { + code_point = u16_swap_bytes(uint16_t(code_point)); + } + *utf16_output++ = char16_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if (pos + 3 >= len) { + break; + } // minimal bound checking + uint32_t code_point = ((leading_byte & 0b00000111) << 18) | + ((uint8_t(data[pos + 1]) & 0b00111111) << 12) | + ((uint8_t(data[pos + 2]) & 0b00111111) << 6) | + (uint8_t(data[pos + 3]) & 0b00111111); + code_point -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); + if simdutf_constexpr (!match_system(big_endian)) { + high_surrogate = u16_swap_bytes(high_surrogate); + low_surrogate = u16_swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos += 4; + } else { + // we may have a continuation but we do not do error checking + return 0; + } + } + return utf16_output - start; +} + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file include/simdutf/scalar/utf8_to_utf32/utf8_to_utf32.h */ +#ifndef SIMDUTF_UTF8_TO_UTF32_H +#define SIMDUTF_UTF8_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf32 { + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 size_t convert(InputPtr data, size_t len, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that + // they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while (pos < final_pos) { + *utf32_output++ = uint8_t(data[pos]); + pos++; + } + continue; + } + } + } + auto leading_byte = uint8_t(data[pos]); // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! 
+ *utf32_output++ = char32_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8 + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } + // range check + uint32_t code_point = (leading_byte & 0b00011111) << 6 | + (uint8_t(data[pos + 1]) & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { + return 0; + } + *utf32_output++ = char32_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8 + if (pos + 2 >= len) { + return 0; + } // minimal bound checking + + if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) { + return 0; + } + if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) { + return 0; + } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (uint8_t(data[pos + 1]) & 0b00111111) << 6 | + (uint8_t(data[pos + 2]) & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return 0; + } + *utf32_output++ = char32_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if (pos + 3 >= len) { + return 0; + } // minimal bound checking + if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) { + return 0; + } + if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) { + return 0; + } + if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) { + return 0; + } + + // range check + uint32_t code_point = (leading_byte & 0b00000111) << 18 | + (uint8_t(data[pos + 1]) & 0b00111111) << 12 | + (uint8_t(data[pos + 2]) & 0b00111111) << 6 | + (uint8_t(data[pos + 3]) & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { + return 0; + } + *utf32_output++ = char32_t(code_point); + pos += 4; + } else { + return 0; + } + } + return utf32_output - start; +} + +template +#if SIMDUTF_CPLUSPLUS20 + requires simdutf::detail::indexes_into_byte_like +#endif +simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { +#if SIMDUTF_CPLUSPLUS23 + if !consteval +#endif + { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that + // they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while (pos < final_pos) { + *utf32_output++ = uint8_t(data[pos]); + pos++; + } + continue; + } + } + } + auto leading_byte = uint8_t(data[pos]); // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! 
+      *utf32_output++ = char32_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8
+      if (pos + 1 >= len) {
+        return result(error_code::TOO_SHORT, pos);
+      } // minimal bound checking
+      if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      // range check
+      uint32_t code_point = (leading_byte & 0b00011111) << 6 |
+                            (uint8_t(data[pos + 1]) & 0b00111111);
+      if (code_point < 0x80 || 0x7ff < code_point) {
+        return result(error_code::OVERLONG, pos);
+      }
+      *utf32_output++ = char32_t(code_point);
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8
+      if (pos + 2 >= len) {
+        return result(error_code::TOO_SHORT, pos);
+      } // minimal bound checking
+
+      if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      // range check
+      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+                            (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
+                            (uint8_t(data[pos + 2]) & 0b00111111);
+      if (code_point < 0x800 || 0xffff < code_point) {
+        return result(error_code::OVERLONG, pos);
+      }
+      if (0xd7ff < code_point && code_point < 0xe000) {
+        return result(error_code::SURROGATE, pos);
+      }
+      *utf32_output++ = char32_t(code_point);
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word.
+      if (pos + 3 >= len) {
+        return result(error_code::TOO_SHORT, pos);
+      } // minimal bound checking
+      if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+
+      // range check
+      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
+                            (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
+                            (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
+                            (uint8_t(data[pos + 3]) & 0b00111111);
+      if (code_point <= 0xffff) {
+        return result(error_code::OVERLONG, pos);
+      }
+      if (0x10ffff < code_point) {
+        return result(error_code::TOO_LARGE, pos);
+      }
+      *utf32_output++ = char32_t(code_point);
+      pos += 4;
+    } else {
+      // we either have too many continuation bytes or an invalid leading byte
+      if ((leading_byte & 0b11000000) == 0b10000000) {
+        return result(error_code::TOO_LONG, pos);
+      } else {
+        return result(error_code::HEADER_BITS, pos);
+      }
+    }
+  }
+  return result(error_code::SUCCESS, utf32_output - start);
+}
+
+/**
+ * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
+ * we have up to len input bytes left, and we encountered some error. It is
+ * possible that the error is at 'buf' exactly, but it could also be within the
+ * previous bytes (up to 3 bytes back).
+ *
+ * prior_bytes indicates how many bytes, prior to 'buf', may belong to the
+ * current memory section and can be safely accessed. We use prior_bytes to
+ * access up to three bytes before 'buf' safely.
+ *
+ * The caller is responsible for ensuring that len > 0.
+ *
+ * If the error is believed to have occurred prior to 'buf', the count value
+ * contained in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
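+ *
+ * Illustrative scenario: if the previous chunk ended with the first byte of
+ * the three-byte sequence E2 82 AC (U+20AC), 'buf' points at the continuation
+ * byte 0x82. With prior_bytes >= 1, the function rewinds one byte, reparses
+ * the whole sequence, and adjusts any error position so that it stays
+ * relative to 'buf'.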
+ */
+inline result rewind_and_convert_with_errors(size_t prior_bytes,
+                                             const char *buf, size_t len,
+                                             char32_t *utf32_output) {
+  size_t extra_len{0};
+  // We potentially need to go back in time and find a leading byte.
+  size_t how_far_back = 3; // 3 bytes in the past + current position
+  if (how_far_back > prior_bytes) {
+    how_far_back = prior_bytes;
+  }
+  bool found_leading_bytes{false};
+  // important: it is i <= how_far_back and not 'i < how_far_back'.
+  for (size_t i = 0; i <= how_far_back; i++) {
+    unsigned char byte = buf[-static_cast<ptrdiff_t>(i)];
+    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+    if (found_leading_bytes) {
+      if (i > 0 && byte < 128) {
+        // If we had to go back and the leading byte is ascii
+        // then we can stop right away.
+        return result(error_code::TOO_LONG, 0 - i + 1);
+      }
+      buf -= i;
+      extra_len = i;
+      break;
+    }
+  }
+  //
+  // It is possible for this function to return a negative count in its result.
+  // C++ Standard Section 18.1 defines size_t in <cstddef>, which is described
+  // in the C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t
+  // as the unsigned integral type of the result of the sizeof operator.
+  //
+  // An unsigned type will simply wrap around arithmetically (well defined).
+  //
+  if (!found_leading_bytes) {
+    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
+    // [....] [continuation] [continuation] [continuation] | [buf is
+    // continuation] Or we possibly have a stream that does not start with a
+    // leading byte.
+    return result(error_code::TOO_LONG, 0 - how_far_back);
+  }
+
+  result res = convert_with_errors(buf, len + extra_len, utf32_output);
+  if (res.error) {
+    res.count -= extra_len;
+  }
+  return res;
+}
+
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file include/simdutf/scalar/utf8_to_utf32/utf8_to_utf32.h */
+/* begin file include/simdutf/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
+#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
+#define SIMDUTF_VALID_UTF8_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf32 {
+
+template <typename InputPtr>
+#if SIMDUTF_CPLUSPLUS20
+  requires simdutf::detail::indexes_into_byte_like<InputPtr>
+#endif
+simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
+                                         char32_t *utf32_output) {
+  size_t pos = 0;
+  char32_t *start{utf32_output};
+  while (pos < len) {
+#if SIMDUTF_CPLUSPLUS23
+    if !consteval
+#endif
+    {
+      // try to convert the next block of 8 ASCII bytes
+      if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that
+                            // they are ascii
+        uint64_t v;
+        ::memcpy(&v, data + pos, sizeof(uint64_t));
+        if ((v & 0x8080808080808080) == 0) {
+          size_t final_pos = pos + 8;
+          while (pos < final_pos) {
+            *utf32_output++ = uint8_t(data[pos]);
+            pos++;
+          }
+          continue;
+        }
+      }
+    }
+    auto leading_byte = uint8_t(data[pos]); // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *utf32_output++ = char32_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8
+      if (pos + 1 >= len) {
+        break;
+      } // minimal bound checking
+      *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) |
+                                 (uint8_t(data[pos + 1]) & 0b00111111));
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8
+      if (pos + 2 >= len) {
+        break;
+      } // minimal bound checking
+      *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) |
+                                 ((uint8_t(data[pos + 1]) & 0b00111111) << 6) |
+                                 (uint8_t(data[pos + 2]) & 0b00111111));
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word.
+      if (pos + 3 >= len) {
+        break;
+      } // minimal bound checking
+      uint32_t code_word = ((leading_byte & 0b00000111) << 18) |
+                           ((uint8_t(data[pos + 1]) & 0b00111111) << 12) |
+                           ((uint8_t(data[pos + 2]) & 0b00111111) << 6) |
+                           (uint8_t(data[pos + 3]) & 0b00111111);
+      *utf32_output++ = char32_t(code_word);
+      pos += 4;
+    } else {
+      // we may have a continuation but we do not do error checking
+      return 0;
+    }
+  }
+  return utf32_output - start;
+}
+
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file include/simdutf/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
+
+namespace simdutf {
+
+constexpr size_t default_line_length =
+    76; ///< default line length for base64 encoding with lines
+
+#if SIMDUTF_FEATURE_DETECT_ENCODING
+/**
+ * Autodetect the encoding of the input; a single encoding is recommended.
+ * E.g., the function might return simdutf::encoding_type::UTF8,
+ * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
+ * simdutf::encoding_type::UTF32_LE.
+ *
+ * @param input the string to analyze.
+ * @param length the length of the string in bytes.
+ * @return the detected encoding type
+ */
+simdutf_warn_unused simdutf::encoding_type
+autodetect_encoding(const char *input, size_t length) noexcept;
+simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
+autodetect_encoding(const uint8_t *input, size_t length) noexcept {
+  return autodetect_encoding(reinterpret_cast<const char *>(input), length);
+}
+  #if SIMDUTF_SPAN
+/**
+ * Autodetect the encoding of the input; a single encoding is recommended.
+ * E.g., the function might return simdutf::encoding_type::UTF8,
+ * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
+ * simdutf::encoding_type::UTF32_LE.
+ *
+ * @param input the string to analyze. Can be anything span-like that has
+ * data() and size() members pointing to character data: std::string,
+ * std::string_view, std::vector, std::span, etc.
+ * @return the detected encoding type
+ */
+simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
+autodetect_encoding(
+    const detail::input_span_of_byte_like auto &input) noexcept {
+  return autodetect_encoding(reinterpret_cast<const char *>(input.data()),
+                             input.size());
+}
+  #endif // SIMDUTF_SPAN
+
+/**
+ * Autodetect the possible encodings of the input in one pass.
+ * E.g., if the input might be UTF-16LE or UTF-8, this function returns
+ * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
+ *
+ * Overridden by each implementation.
+ *
+ * @param input the string to analyze.
+ * @param length the length of the string in bytes.
+ * @return the detected encoding type + */ +simdutf_warn_unused int detect_encodings(const char *input, + size_t length) noexcept; +simdutf_really_inline simdutf_warn_unused int +detect_encodings(const uint8_t *input, size_t length) noexcept { + return detect_encodings(reinterpret_cast(input), length); +} + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused int +detect_encodings(const detail::input_span_of_byte_like auto &input) noexcept { + return detect_encodings(reinterpret_cast(input.data()), + input.size()); +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +/** + * Validate the UTF-8 string. This function may be best when you expect + * the input to be almost always valid. Otherwise, consider using + * validate_utf8_with_errors. * * Overridden by each implementation. * @@ -966,7 +5029,24 @@ simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * i * @return true if and only if the string is valid UTF-8. */ simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_constexpr23 simdutf_really_inline simdutf_warn_unused bool +validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8::validate( + detail::constexpr_cast_ptr(input.data()), input.size()); + } else + #endif + { + return validate_utf8(reinterpret_cast(input.data()), + input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 /** * Validate the UTF-8 string and stop on error. * @@ -974,10 +5054,32 @@ simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept; * * @param buf the UTF-8 string to validate. * @param len the length of the string in bytes. - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated if + * successful. */ -simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept; +simdutf_warn_unused result validate_utf8_with_errors(const char *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result +validate_utf8_with_errors( + const detail::input_span_of_byte_like auto &input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8::validate_with_errors( + detail::constexpr_cast_ptr(input.data()), input.size()); + } else + #endif + { + return validate_utf8_with_errors( + reinterpret_cast(input.data()), input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_ASCII /** * Validate the ASCII string. * @@ -988,6 +5090,21 @@ simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len * @return true if and only if the string is valid ASCII. 
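 *
 * A minimal usage sketch (the buffer contents are illustrative):
 *
 *   const char s[] = "hello";
 *   bool ok = simdutf::validate_ascii(s, 5); // true: every byte is < 0x80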
*/ simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool +validate_ascii(const detail::input_span_of_byte_like auto &input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::ascii::validate( + detail::constexpr_cast_ptr(input.data()), input.size()); + } else + #endif + { + return validate_ascii(reinterpret_cast(input.data()), + input.size()); + } +} + #endif // SIMDUTF_SPAN /** * Validate the ASCII string and stop on error. It might be faster than @@ -997,25 +5114,151 @@ simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept; * * @param buf the ASCII string to validate. * @param len the length of the string in bytes. - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated if + * successful. + */ +simdutf_warn_unused result validate_ascii_with_errors(const char *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +validate_ascii_with_errors( + const detail::input_span_of_byte_like auto &input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::ascii::validate_with_errors( + detail::constexpr_cast_ptr(input.data()), input.size()); + } else + #endif + { + return validate_ascii_with_errors( + reinterpret_cast(input.data()), input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +/** + * Validate the ASCII string as a UTF-16 sequence. + * An UTF-16 sequence is considered an ASCII sequence + * if it could be converted to an ASCII string losslessly. + * + * Overridden by each implementation. + * + * @param buf the UTF-16 string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid ASCII. + */ +simdutf_warn_unused bool validate_utf16_as_ascii(const char16_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool +validate_utf16_as_ascii(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::validate_as_ascii(input.data(), + input.size()); + } else + #endif + { + return validate_utf16_as_ascii(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN + +/** + * Validate the ASCII string as a UTF-16BE sequence. + * An UTF-16 sequence is considered an ASCII sequence + * if it could be converted to an ASCII string losslessly. + * + * Overridden by each implementation. + * + * @param buf the UTF-16BE string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid ASCII. 
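+ *
+ * For instance (illustrative), the single big-endian code unit for 'A'
+ * (U+0041) is ASCII-convertible, while the one for 'é' (U+00E9) is not.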
+ */ +simdutf_warn_unused bool validate_utf16be_as_ascii(const char16_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool +validate_utf16be_as_ascii(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::validate_as_ascii(input.data(), + input.size()); + } else + #endif + { + return validate_utf16be_as_ascii(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN + +/** + * Validate the ASCII string as a UTF-16LE sequence. + * An UTF-16 sequence is considered an ASCII sequence + * if it could be converted to an ASCII string losslessly. + * + * Overridden by each implementation. + * + * @param buf the UTF-16LE string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid ASCII. */ -simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept; +simdutf_warn_unused bool validate_utf16le_as_ascii(const char16_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool +validate_utf16le_as_ascii(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::validate_as_ascii(input.data(), + input.size()); + } else + #endif + { + return validate_utf16le_as_ascii(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 /** * Using native endianness; Validate the UTF-16 string. - * This function may be best when you expect the input to be almost always valid. - * Otherwise, consider using validate_utf16_with_errors. + * This function may be best when you expect the input to be almost always + * valid. Otherwise, consider using validate_utf16_with_errors. * * Overridden by each implementation. * * This function is not BOM-aware. * * @param buf the UTF-16 string to validate. - * @param len the length of the string in number of 2-byte code units (char16_t). + * @param len the length of the string in number of 2-byte code units + * (char16_t). * @return true if and only if the string is valid UTF-16. */ -simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept; +simdutf_warn_unused bool validate_utf16(const char16_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool +validate_utf16(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::validate(input.data(), + input.size()); + } else + #endif + { + return validate_utf16(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING /** * Validate the UTF-16LE string. This function may be best when you expect * the input to be almost always valid. Otherwise, consider using @@ -1026,11 +5269,29 @@ simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcep * This function is not BOM-aware. * * @param buf the UTF-16LE string to validate. - * @param len the length of the string in number of 2-byte code units (char16_t). + * @param len the length of the string in number of 2-byte code units + * (char16_t). * @return true if and only if the string is valid UTF-16LE. 
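 *
 * A minimal sketch (assuming a little-endian host, so that char16_t values
 * are already in UTF-16LE byte order):
 *
 *   const char16_t lone = 0xD800;                  // unpaired high surrogate
 *   bool ok = simdutf::validate_utf16le(&lone, 1); // false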
*/ -simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexcept; +simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused bool +validate_utf16le(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::validate(input.data(), + input.size()); + } else + #endif + { + return validate_utf16le(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF16 /** * Validate the UTF-16BE string. This function may be best when you expect * the input to be almost always valid. Otherwise, consider using @@ -1041,146 +5302,524 @@ simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexc * This function is not BOM-aware. * * @param buf the UTF-16BE string to validate. - * @param len the length of the string in number of 2-byte code units (char16_t). + * @param len the length of the string in number of 2-byte code units + * (char16_t). * @return true if and only if the string is valid UTF-16BE. */ -simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept; +simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool +validate_utf16be(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::validate(input.data(), input.size()); + } else + #endif + { + return validate_utf16be(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN + +/** + * Using native endianness; Validate the UTF-16 string and stop on error. + * It might be faster than validate_utf16 when an error is expected to occur + * early. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16 string to validate. + * @param len the length of the string in number of 2-byte code units + * (char16_t). + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated if + * successful. + */ +simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +validate_utf16_with_errors(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::validate_with_errors( + input.data(), input.size()); + } else + #endif + { + return validate_utf16_with_errors(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN + +/** + * Validate the UTF-16LE string and stop on error. It might be faster than + * validate_utf16le when an error is expected to occur early. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16LE string to validate. + * @param len the length of the string in number of 2-byte code units + * (char16_t). + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated if + * successful. 
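+ *
+ * A minimal sketch of consuming the result (buf and len are hypothetical):
+ *
+ *   simdutf::result r = simdutf::validate_utf16le_with_errors(buf, len);
+ *   if (r.error != simdutf::error_code::SUCCESS) {
+ *     size_t first_bad_unit = r.count; // offset in char16_t units
+ *   }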
+ */ +simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +validate_utf16le_with_errors(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::validate_with_errors( + input.data(), input.size()); + } else + #endif + { + return validate_utf16le_with_errors(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN + +/** + * Validate the UTF-16BE string and stop on error. It might be faster than + * validate_utf16be when an error is expected to occur early. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16BE string to validate. + * @param len the length of the string in number of 2-byte code units + * (char16_t). + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated if + * successful. + */ +simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +validate_utf16be_with_errors(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::validate_with_errors(input.data(), + input.size()); + } else + #endif + { + return validate_utf16be_with_errors(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN + +/** + * Fixes an ill-formed UTF-16LE string by replacing mismatched surrogates with + * the Unicode replacement character U+FFFD. If input and output points to + * different memory areas, the procedure copies string, and it's expected that + * output memory is at least as big as the input. It's also possible to set + * input equal output, that makes replacements an in-place operation. + * + * @param input the UTF-16LE string to correct. + * @param len the length of the string in number of 2-byte code units + * (char16_t). + * @param output the output buffer. + */ +void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 void +to_well_formed_utf16le(std::span input, + std::span output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + scalar::utf16::to_well_formed_utf16( + input.data(), input.size(), output.data()); + } else + #endif + { + to_well_formed_utf16le(input.data(), input.size(), output.data()); + } +} + #endif // SIMDUTF_SPAN + +/** + * Fixes an ill-formed UTF-16BE string by replacing mismatched surrogates with + * the Unicode replacement character U+FFFD. If input and output points to + * different memory areas, the procedure copies string, and it's expected that + * output memory is at least as big as the input. It's also possible to set + * input equal output, that makes replacements an in-place operation. + * + * @param input the UTF-16BE string to correct. + * @param len the length of the string in number of 2-byte code units + * (char16_t). + * @param output the output buffer. 
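+ *
+ * A minimal sketch (buf and len are hypothetical); passing the same pointer
+ * for input and output performs the repair in place:
+ *
+ *   simdutf::to_well_formed_utf16be(buf, len, buf);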
+ */ +void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 void +to_well_formed_utf16be(std::span input, + std::span output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + scalar::utf16::to_well_formed_utf16( + input.data(), input.size(), output.data()); + } else + #endif + { + to_well_formed_utf16be(input.data(), input.size(), output.data()); + } +} + #endif // SIMDUTF_SPAN + +/** + * Fixes an ill-formed UTF-16 string by replacing mismatched surrogates with the + * Unicode replacement character U+FFFD. If input and output points to different + * memory areas, the procedure copies string, and it's expected that output + * memory is at least as big as the input. It's also possible to set input equal + * output, that makes replacements an in-place operation. + * + * @param input the UTF-16 string to correct. + * @param len the length of the string in number of 2-byte code units + * (char16_t). + * @param output the output buffer. + */ +void to_well_formed_utf16(const char16_t *input, size_t len, + char16_t *output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 void +to_well_formed_utf16(std::span input, + std::span output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + scalar::utf16::to_well_formed_utf16( + input.data(), input.size(), output.data()); + } else + #endif + { + to_well_formed_utf16(input.data(), input.size(), output.data()); + } +} + #endif // SIMDUTF_SPAN + +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +/** + * Validate the UTF-32 string. This function may be best when you expect + * the input to be almost always valid. Otherwise, consider using + * validate_utf32_with_errors. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-32 string to validate. + * @param len the length of the string in number of 4-byte code units + * (char32_t). + * @return true if and only if the string is valid UTF-32. + */ +simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool +validate_utf32(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32::validate( + detail::constexpr_cast_ptr(input.data()), input.size()); + } else + #endif + { + return validate_utf32(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING + +#if SIMDUTF_FEATURE_UTF32 +/** + * Validate the UTF-32 string and stop on error. It might be faster than + * validate_utf32 when an error is expected to occur early. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-32 string to validate. + * @param len the length of the string in number of 4-byte code units + * (char32_t). + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated if + * successful. 
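+ *
+ * A minimal sketch (illustrative value): code points above U+10FFFF fail:
+ *
+ *   const char32_t bad = 0x110000;
+ *   simdutf::result r = simdutf::validate_utf32_with_errors(&bad, 1);
+ *   // r.error == simdutf::error_code::TOO_LARGE, r.count == 0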
+ */ +simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, + size_t len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +validate_utf32_with_errors(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32::validate_with_errors( + detail::constexpr_cast_ptr(input.data()), input.size()); + } else + #endif + { + return validate_utf32_with_errors(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF32 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/** + * Convert Latin1 string into UTF-8 string. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf8_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input, + size_t length, + char *utf8_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_latin1_to_utf8( + const detail::input_span_of_byte_like auto &latin1_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::latin1_to_utf8::convert( + detail::constexpr_cast_ptr(latin1_input.data()), + latin1_input.size(), + detail::constexpr_cast_writeptr(utf8_output.data())); + } else + #endif + { + return convert_latin1_to_utf8( + reinterpret_cast(latin1_input.data()), + latin1_input.size(), reinterpret_cast(utf8_output.data())); + } +} + #endif // SIMDUTF_SPAN + +/** + * Convert Latin1 string into UTF-8 string with output limit. + * + * This function is suitable to work with inputs from untrusted sources. + * + * We write as many characters as possible. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf8_output the pointer to buffer that can hold conversion result + * @param utf8_len the maximum output length + * @return the number of written char; 0 if conversion is not possible + */ +simdutf_warn_unused size_t +convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output, + size_t utf8_len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_latin1_to_utf8_safe( + const detail::input_span_of_byte_like auto &input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + // implementation note: outputspan is a forwarding ref to avoid copying + // and allow both lvalues and rvalues. std::span can be copied without + // problems, but std::vector should not, and this function should accept + // both. it will allow using an owning rvalue ref (example: passing a + // temporary std::string) as output, but the user will quickly find out + // that he has no way of getting the data out of the object in that case. 
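+  // illustrative arithmetic: converting "café" (4 Latin1 bytes, 63 61 66 E9)
+  // needs a 5-byte UTF-8 buffer, since only 0xE9 ('é') expands to two bytes
+  // (C3 A9).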
+ #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::latin1_to_utf8::convert_safe_constexpr( + input.data(), input.size(), utf8_output.data(), utf8_output.size()); + } else + #endif + { + return convert_latin1_to_utf8_safe( + reinterpret_cast(input.data()), input.size(), + reinterpret_cast(utf8_output.data()), utf8_output.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 /** - * Using native endianness; Validate the UTF-16 string and stop on error. - * It might be faster than validate_utf16 when an error is expected to occur early. - * - * Overridden by each implementation. + * Convert possibly Latin1 string into UTF-16LE string. * - * This function is not BOM-aware. + * This function is suitable to work with inputs from untrusted sources. * - * @param buf the UTF-16 string to validate. - * @param len the length of the string in number of 2-byte code units (char16_t). - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if conversion is not possible */ -simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_t len) noexcept; +simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *input, size_t length, char16_t *utf16_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_latin1_to_utf16le( + const detail::input_span_of_byte_like auto &latin1_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::latin1_to_utf16::convert( + latin1_input.data(), latin1_input.size(), utf16_output.data()); + } else + #endif + { + return convert_latin1_to_utf16le( + reinterpret_cast(latin1_input.data()), + latin1_input.size(), utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** - * Validate the UTF-16LE string and stop on error. It might be faster than - * validate_utf16le when an error is expected to occur early. + * Convert Latin1 string into UTF-16BE string. * - * Overridden by each implementation. + * This function is suitable to work with inputs from untrusted sources. * - * This function is not BOM-aware. 
+ * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *input, size_t length, char16_t *utf16_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_latin1_to_utf16be(const detail::input_span_of_byte_like auto &input, + std::span output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::latin1_to_utf16::convert( + input.data(), input.size(), output.data()); + } else + #endif + { + return convert_latin1_to_utf16be( + reinterpret_cast(input.data()), input.size(), + output.data()); + } +} + #endif // SIMDUTF_SPAN +/** + * Compute the number of bytes that this UTF-16 string would require in Latin1 + * format. * - * @param buf the UTF-16LE string to validate. - * @param len the length of the string in number of 2-byte code units (char16_t). - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @param length the length of the string in Latin1 code units (char) + * @return the length of the string in Latin1 code units (char) required to + * encode the UTF-16 string as Latin1 */ -simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) noexcept; +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +latin1_length_from_utf16(size_t length) noexcept { + return length; +} /** - * Validate the UTF-16BE string and stop on error. It might be faster than - * validate_utf16be when an error is expected to occur early. - * - * Overridden by each implementation. - * - * This function is not BOM-aware. + * Compute the number of code units that this Latin1 string would require in + * UTF-16 format. * - * @param buf the UTF-16BE string to validate. - * @param len the length of the string in number of 2-byte code units (char16_t). - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @param length the length of the string in Latin1 code units (char) + * @return the length of the string in 2-byte code units (char16_t) required to + * encode the Latin1 string as UTF-16 */ -simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept; +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +utf16_length_from_latin1(size_t length) noexcept { + return length; +} +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 /** - * Validate the UTF-32 string. This function may be best when you expect - * the input to be almost always valid. Otherwise, consider using - * validate_utf32_with_errors. - * - * Overridden by each implementation. + * Convert Latin1 string into UTF-32 string. * - * This function is not BOM-aware. + * This function is suitable to work with inputs from untrusted sources. * - * @param buf the UTF-32 string to validate. - * @param len the length of the string in number of 4-byte code units (char32_t). 
- * @return true if and only if the string is valid UTF-32. + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return the number of written char32_t; 0 if conversion is not possible */ -simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept; +simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *input, size_t length, char32_t *utf32_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_latin1_to_utf32( + const detail::input_span_of_byte_like auto &latin1_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::latin1_to_utf32::convert( + latin1_input.data(), latin1_input.size(), utf32_output.data()); + } else + #endif + { + return convert_latin1_to_utf32( + reinterpret_cast(latin1_input.data()), + latin1_input.size(), utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 /** - * Validate the UTF-32 string and stop on error. It might be faster than - * validate_utf32 when an error is expected to occur early. + * Convert possibly broken UTF-8 string into latin1 string. * - * Overridden by each implementation. - * - * This function is not BOM-aware. + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. * - * @param buf the UTF-32 string to validate. - * @param len the length of the string in number of 4-byte code units (char32_t). - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if the input was not valid UTF-8 string + * or if it cannot be represented as Latin1 */ -simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept; - - /** - * Convert Latin1 string into UTF8 string. - * - * This function is suitable to work with inputs from untrusted sources. - * - * @param input the Latin1 string to convert - * @param length the length of the string in bytes - * @param latin1_output the pointer to buffer that can hold conversion result - * @return the number of written char; 0 if conversion is not possible - */ - simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept; - - - /** - * Convert possibly Latin1 string into UTF-16LE string. - * - * This function is suitable to work with inputs from untrusted sources. - * - * @param input the Latin1 string to convert - * @param length the length of the string in bytes - * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return the number of written char16_t; 0 if conversion is not possible - */ - simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept; - - /** - * Convert Latin1 string into UTF-16BE string. - * - * This function is suitable to work with inputs from untrusted sources. 
- * - * @param input the Latin1 string to convert - * @param length the length of the string in bytes - * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return the number of written char16_t; 0 if conversion is not possible - */ - simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; - - /** - * Convert Latin1 string into UTF-32 string. - * - * This function is suitable to work with inputs from untrusted sources. - * - * @param input the Latin1 string to convert - * @param length the length of the string in bytes - * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return the number of written char32_t; 0 if conversion is not possible - */ - simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept; - - /** - * Convert possibly broken UTF-8 string into latin1 string. - * - * During the conversion also validation of the input string is done. - * This function is suitable to work with inputs from untrusted sources. - * - * @param input the UTF-8 string to convert - * @param length the length of the string in bytes - * @param latin1_output the pointer to buffer that can hold conversion result - * @return the number of written char; 0 if the input was not valid UTF-8 string or if it cannot be represented as Latin1 - */ - simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept; +simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input, + size_t length, + char *latin1_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf8_to_latin1( + const detail::input_span_of_byte_like auto &input, + detail::output_span_of_byte_like auto &&output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_latin1::convert(input.data(), input.size(), + output.data()); + } else + #endif + { + return convert_utf8_to_latin1(reinterpret_cast(input.data()), + input.size(), + reinterpret_cast(output.data())); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** - * Using native endianness, convert possibly broken UTF-8 string into a UTF-16 string. + * Using native endianness, convert possibly broken UTF-8 string into a UTF-16 + * string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. 
@@ -1188,21 +5827,132 @@ simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_ * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + * @return the number of written char16_t; 0 if the input was not valid UTF-8 + * string + */ +simdutf_warn_unused size_t convert_utf8_to_utf16( + const char *input, size_t length, char16_t *utf16_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input, + std::span output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf16::convert( + input.data(), input.size(), output.data()); + } else + #endif + { + return convert_utf8_to_utf16(reinterpret_cast(input.data()), + input.size(), output.data()); + } +} + #endif // SIMDUTF_SPAN + +/** + * Compute the number of bytes that this UTF-16LE string would require in UTF-8 + * format even when the UTF-16LE content contains mismatched surrogates + * that have to be replaced by the replacement character (0xFFFD). + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) where the count is the number of bytes required to + * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS or + * SURROGATE. The count is correct regardless of the error field. + * When SURROGATE is returned, it does not indicate an error in the case of this + * function: it indicates that at least one surrogate has been encountered: the + * surrogates may be matched or not (thus this function does not validate). If + * the returned error code is SUCCESS, then the input contains no surrogate, is + * in the Basic Multilingual Plane, and is necessarily valid. + */ +simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result +utf8_length_from_utf16le_with_replacement( + std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::LITTLE>(valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return utf8_length_from_utf16le_with_replacement(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN + +/** + * Compute the number of bytes that this UTF-16BE string would require in UTF-8 + * format even when the UTF-16BE content contains mismatched surrogates + * that have to be replaced by the replacement character (0xFFFD). + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) where the count is the number of bytes required to + * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS or + * SURROGATE. The count is correct regardless of the error field. 
+ * When SURROGATE is returned, it does not indicate an error in the case of this + * function: it indicates that at least one surrogate has been encountered: the + * surrogates may be matched or not (thus this function does not validate). If + * the returned error code is SUCCESS, then the input contains no surrogate, is + * in the Basic Multilingual Plane, and is necessarily valid. */ -simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept; +simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +utf8_length_from_utf16be_with_replacement( + std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::BIG>(valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return utf8_length_from_utf16be_with_replacement(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 /** * Using native endianness, convert a Latin1 string into a UTF-16 string. * - * @param input the UTF-8 string to convert + * @param input the Latin1 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result * @return the number of written char16_t. */ -simdutf_warn_unused size_t convert_latin1_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept; +simdutf_warn_unused size_t convert_latin1_to_utf16( + const char *input, size_t length, char16_t *utf16_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_latin1_to_utf16(const detail::input_span_of_byte_like auto &input, + std::span output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::latin1_to_utf16::convert( + input.data(), input.size(), output.data()); + } else + #endif + { + return convert_latin1_to_utf16(reinterpret_cast(input.data()), + input.size(), output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** * Convert possibly broken UTF-8 string into UTF-16LE string. 
* @@ -1212,9 +5962,28 @@ simdutf_warn_unused size_t convert_latin1_to_utf16(const char * input, size_t le * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + * @return the number of written char16_t; 0 if the input was not valid UTF-8 + * string */ -simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept; +simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *input, size_t length, char16_t *utf16_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf16::convert( + utf8_input.data(), utf8_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf8_to_utf16le( + reinterpret_cast(utf8_input.data()), utf8_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-8 string into UTF-16BE string. @@ -1225,26 +5994,71 @@ simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t le * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + * @return the number of written char16_t; 0 if the input was not valid UTF-8 + * string */ -simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; - +simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *input, size_t length, char16_t *utf16_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input, + std::span utf16_output) noexcept { + + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf16::convert( + utf8_input.data(), utf8_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf8_to_utf16be( + reinterpret_cast(utf8_input.data()), utf8_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 - /** - * Convert possibly broken UTF-8 string into latin1 string with errors. - * If the string cannot be represented as Latin1, an error - * code is returned. - * - * During the conversion also validation of the input string is done. - * This function is suitable to work with inputs from untrusted sources. - * - * @param input the UTF-8 string to convert - * @param length the length of the string in bytes - * @param latin1_output the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 
- */ - simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) noexcept; +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/** + * Convert possibly broken UTF-8 string into latin1 string with errors. + * If the string cannot be represented as Latin1, an error + * code is returned. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated if + * successful. + */ +simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *input, size_t length, char *latin1_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf8_to_latin1_with_errors( + const detail::input_span_of_byte_like auto &utf8_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_latin1::convert_with_errors( + utf8_input.data(), utf8_input.size(), latin1_output.data()); + } else + #endif + { + return convert_utf8_to_latin1_with_errors( + reinterpret_cast(utf8_input.data()), utf8_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** * Using native endianness, convert possibly broken UTF-8 string into UTF-16 * string and stop on error. @@ -1255,9 +6069,31 @@ simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t le * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char16_t written if + * successful. 
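// The _with_errors variants return the simdutf::result struct described
// above (fields error and count). A sketch of checking it for the UTF-8 to
// Latin1 conversion; the output is sized to the input length, which is a
// safe upper bound because Latin1 never needs more bytes than the UTF-8
// source. Illustrative only.
#include "simdutf.h"

#include <cstdio>
#include <string>
#include <vector>

bool utf8_to_latin1_checked(const std::string &utf8, std::vector<char> &out) {
  out.resize(utf8.size()); // Latin1 output never exceeds the UTF-8 input size
  simdutf::result r = simdutf::convert_utf8_to_latin1_with_errors(
      utf8.data(), utf8.size(), out.data());
  if (r.error != simdutf::error_code::SUCCESS) {
    std::fprintf(stderr, "failure near input code unit %zu\n", r.count);
    return false;
  }
  // Valid input, so the sizing helper declared later is well defined here.
  out.resize(simdutf::latin1_length_from_utf8(utf8.data(), utf8.size()));
  return true;
}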
*/ -simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; +simdutf_warn_unused result convert_utf8_to_utf16_with_errors( + const char *input, size_t length, char16_t *utf16_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf8_to_utf16_with_errors( + const detail::input_span_of_byte_like auto &utf8_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf16::convert_with_errors( + utf8_input.data(), utf8_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf8_to_utf16_with_errors( + reinterpret_cast(utf8_input.data()), utf8_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error. @@ -1268,9 +6104,31 @@ simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char16_t written if + * successful. */ -simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; +simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *input, size_t length, char16_t *utf16_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf8_to_utf16le_with_errors( + const detail::input_span_of_byte_like auto &utf8_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf16::convert_with_errors( + utf8_input.data(), utf8_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf8_to_utf16le_with_errors( + reinterpret_cast(utf8_input.data()), utf8_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error. @@ -1281,10 +6139,34 @@ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * inpu * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char16_t written if + * successful. 
*/ -simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; +simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *input, size_t length, char16_t *utf16_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf8_to_utf16be_with_errors( + const detail::input_span_of_byte_like auto &utf8_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf16::convert_with_errors( + utf8_input.data(), utf8_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf8_to_utf16be_with_errors( + reinterpret_cast(utf8_input.data()), utf8_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 /** * Convert possibly broken UTF-8 string into UTF-32 string. * @@ -1294,9 +6176,28 @@ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * inpu * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return the number of written char32_t; 0 if the input was not valid UTF-8 string + * @return the number of written char32_t; 0 if the input was not valid UTF-8 + * string */ -simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept; +simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *input, size_t length, char32_t *utf32_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf8_to_utf32(const detail::input_span_of_byte_like auto &utf8_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf32::convert(utf8_input.data(), utf8_input.size(), + utf32_output.data()); + } else + #endif + { + return convert_utf8_to_utf32( + reinterpret_cast(utf8_input.data()), utf8_input.size(), + utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-8 string into UTF-32 string and stop on error. @@ -1307,25 +6208,76 @@ simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t leng * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char32_t written if + * successful. */ -simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept; - - /** - * Convert valid UTF-8 string into latin1 string. - * - * This function assumes that the input string is valid UTF-8. - * - * This function is not BOM-aware. 
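// Sketch pairing convert_utf8_to_utf32_with_errors with a safe output
// bound: UTF-8 encodes every code point in at least one byte, so the input
// byte count is always enough char32_t slots. On success, count is the
// number of char32_t written, as the comment above specifies.
#include "simdutf.h"

#include <string>
#include <vector>

bool utf8_to_utf32_checked(const std::string &utf8,
                           std::vector<char32_t> &out) {
  out.resize(utf8.size()); // at most one char32_t per UTF-8 byte
  simdutf::result r = simdutf::convert_utf8_to_utf32_with_errors(
      utf8.data(), utf8.size(), out.data());
  if (r.error != simdutf::error_code::SUCCESS) {
    return false; // r.count points at the offending input position
  }
  out.resize(r.count);
  return true;
}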
- * - * @param input the UTF-8 string to convert - * @param length the length of the string in bytes - * @param latin1_output the pointer to buffer that can hold conversion result - * @return the number of written char; 0 if the input was not valid UTF-8 string - */ - simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept; +simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *input, size_t length, char32_t *utf32_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf8_to_utf32_with_errors( + const detail::input_span_of_byte_like auto &utf8_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf32::convert_with_errors( + utf8_input.data(), utf8_input.size(), utf32_output.data()); + } else + #endif + { + return convert_utf8_to_utf32_with_errors( + reinterpret_cast(utf8_input.data()), utf8_input.size(), + utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +/** + * Convert valid UTF-8 string into latin1 string. + * + * This function assumes that the input string is valid UTF-8 and that it can be + * represented as Latin1. If you violate this assumption, the result is + * implementation defined and may include system-dependent behavior such as + * crashes. + * + * This function is for expert users only and not part of our public API. Use + * convert_utf8_to_latin1 instead. The function may be removed from the library + * in the future. + * + * This function is not BOM-aware. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if the input was not valid UTF-8 string + */ +simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *input, size_t length, char *latin1_output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf8_to_latin1( + const detail::input_span_of_byte_like auto &valid_utf8_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_latin1::convert_valid( + valid_utf8_input.data(), valid_utf8_input.size(), latin1_output.data()); + } else + #endif + { + return convert_valid_utf8_to_latin1( + reinterpret_cast(valid_utf8_input.data()), + valid_utf8_input.size(), latin1_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** * Using native endianness, convert valid UTF-8 string into a UTF-16 string. 
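// The convert_valid_* functions above trade checking for speed and are
// documented as expert-only. A sketch of the guarded pattern: validate
// first with simdutf::validate_utf8, then take the unchecked fast path.
// Illustrative only.
#include "simdutf.h"

#include <string>
#include <vector>

size_t utf16_via_fast_path(const std::string &utf8,
                           std::vector<char16_t> &out) {
  if (!simdutf::validate_utf8(utf8.data(), utf8.size())) {
    return 0; // prefer the checked converters for untrusted input
  }
  out.resize(simdutf::utf16_length_from_utf8(utf8.data(), utf8.size()));
  return simdutf::convert_valid_utf8_to_utf16(utf8.data(), utf8.size(),
                                              out.data());
}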
* @@ -1336,7 +6288,26 @@ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, * @param utf16_buffer the pointer to buffer that can hold conversion result * @return the number of written char16_t */ -simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf8_to_utf16( + const char *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf8_to_utf16( + const detail::input_span_of_byte_like auto &valid_utf8_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf16::convert_valid( + valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data()); + } else + #endif + { + return convert_valid_utf8_to_utf16( + reinterpret_cast(valid_utf8_input.data()), + valid_utf8_input.size(), utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-8 string into UTF-16LE string. @@ -1348,7 +6319,27 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_ * @param utf16_buffer the pointer to buffer that can hold conversion result * @return the number of written char16_t */ -simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf8_to_utf16le( + const detail::input_span_of_byte_like auto &valid_utf8_input, + std::span utf16_output) noexcept { + + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf16::convert_valid( + valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data()); + } else + #endif + { + return convert_valid_utf8_to_utf16le( + reinterpret_cast(valid_utf8_input.data()), + valid_utf8_input.size(), utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-8 string into UTF-16BE string. @@ -1360,8 +6351,29 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, siz * @param utf16_buffer the pointer to buffer that can hold conversion result * @return the number of written char16_t */ -simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf8_to_utf16be( + const detail::input_span_of_byte_like auto &valid_utf8_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf16::convert_valid( + valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data()); + } else + #endif + { + return convert_valid_utf8_to_utf16be( + reinterpret_cast(valid_utf8_input.data()), + valid_utf8_input.size(), utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 /** * Convert valid UTF-8 string into UTF-32 string. 
* @@ -1372,22 +6384,63 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, siz * @param utf32_buffer the pointer to buffer that can hold conversion result * @return the number of written char32_t */ -simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept; - +simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *input, size_t length, char32_t *utf32_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf8_to_utf32( + const detail::input_span_of_byte_like auto &valid_utf8_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8_to_utf32::convert_valid( + valid_utf8_input.data(), valid_utf8_input.size(), utf32_output.data()); + } else + #endif + { + return convert_valid_utf8_to_utf32( + reinterpret_cast(valid_utf8_input.data()), + valid_utf8_input.size(), utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 /** - * Return the number of bytes that this Latin1 string would require in UTF-8 format. + * Return the number of bytes that this Latin1 string would require in UTF-8 + * format. * * @param input the Latin1 string to convert * @param length the length of the string bytes * @return the number of bytes required to encode the Latin1 string as UTF-8 */ -simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) noexcept; +simdutf_warn_unused size_t utf8_length_from_latin1(const char *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +utf8_length_from_latin1( + const detail::input_span_of_byte_like auto &latin1_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::latin1_to_utf8::utf8_length_from_latin1(latin1_input.data(), + latin1_input.size()); + } else + #endif + { + return utf8_length_from_latin1( + reinterpret_cast(latin1_input.data()), + latin1_input.size()); + } +} + #endif // SIMDUTF_SPAN /** - * Compute the number of bytes that this UTF-8 string would require in Latin1 format. + * Compute the number of bytes that this UTF-8 string would require in Latin1 + * format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-8 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. 
* @@ -1395,38 +6448,106 @@ simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t le * @param length the length of the string in byte * @return the number of bytes required to encode the UTF-8 string as Latin1 */ -simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) noexcept; +simdutf_warn_unused size_t latin1_length_from_utf8(const char *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +latin1_length_from_utf8( + const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8::count_code_points(valid_utf8_input.data(), + valid_utf8_input.size()); + } else + #endif + { + return latin1_length_from_utf8( + reinterpret_cast(valid_utf8_input.data()), + valid_utf8_input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** - * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format. + * Compute the number of 2-byte code units that this UTF-8 string would require + * in UTF-16LE format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-8 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char16_t code units required to encode the UTF-8 string as UTF-16LE + * @return the number of char16_t code units required to encode the UTF-8 string + * as UTF-16LE */ -simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept; +simdutf_warn_unused size_t utf16_length_from_utf8(const char *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +utf16_length_from_utf8( + const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8::utf16_length_from_utf8(valid_utf8_input.data(), + valid_utf8_input.size()); + } else + #endif + { + return utf16_length_from_utf8( + reinterpret_cast(valid_utf8_input.data()), + valid_utf8_input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 /** - * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format. + * Compute the number of 4-byte code units that this UTF-8 string would require + * in UTF-32 format. * * This function is equivalent to count_utf8 * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-8 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. 
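// Sketch of the allocate-then-convert idiom built on the sizing functions
// above. convert_latin1_to_utf8 itself is assumed from elsewhere in this
// header (it does not appear in this hunk), so treat that call as
// illustrative. Latin1 input is always convertible, so no validity check
// is needed.
#include "simdutf.h"

#include <string>

std::string latin1_to_utf8_example(const std::string &latin1) {
  std::string out(
      simdutf::utf8_length_from_latin1(latin1.data(), latin1.size()), '\0');
  // assumed API: simdutf::convert_latin1_to_utf8(const char *, size_t, char *)
  out.resize(
      simdutf::convert_latin1_to_utf8(latin1.data(), latin1.size(),
                                      out.data()));
  return out;
}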
 *
 * @param input the UTF-8 string to process
 * @param length the length of the string in bytes
- * @return the number of char32_t code units required to encode the UTF-8 string as UTF-32
+ * @return the number of char32_t code units required to encode the UTF-8 string
+ * as UTF-32
 */
-simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept;
+simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
+                                                  size_t length) noexcept;
+  #if SIMDUTF_SPAN
+simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
+utf32_length_from_utf8(
+    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
+
+  #if SIMDUTF_CPLUSPLUS23
+  if consteval {
+    return scalar::utf8::count_code_points(valid_utf8_input.data(),
+                                           valid_utf8_input.size());
+  } else
+  #endif
+  {
+    return utf32_length_from_utf8(
+        reinterpret_cast(valid_utf8_input.data()),
+        valid_utf8_input.size());
+  }
+}
+  #endif // SIMDUTF_SPAN
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
+#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
 /**
- * Using native endianness, convert possibly broken UTF-16 string into UTF-8 string.
+ * Using native endianness, convert possibly broken UTF-16 string into UTF-8
+ * string.
 *
 * During the conversion also validation of the input string is done.
 * This function is suitable to work with inputs from untrusted sources.
@@ -1436,14 +6557,89 @@ simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t len
 * @param input the UTF-16 string to convert
 * @param length the length of the string in 2-byte code units (char16_t)
 * @param utf8_buffer the pointer to buffer that can hold conversion result
- * @return number of written code units; 0 if input is not a valid UTF-16LE string
+ * @return number of written code units; 0 if input is not a valid UTF-16
+ * string
 */
-simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
-
+simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
+                                                 size_t length,
+                                                 char *utf8_buffer) noexcept;
+  #if SIMDUTF_SPAN
+simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
+convert_utf16_to_utf8(
+    std::span utf16_input,
+    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
+  #if SIMDUTF_CPLUSPLUS23
+  if consteval {
+    return scalar::utf16_to_utf8::convert(
+        utf16_input.data(), utf16_input.size(), utf8_output.data());
+  } else
+  #endif
+  {
+    return convert_utf16_to_utf8(utf16_input.data(), utf16_input.size(),
+                                 reinterpret_cast(utf8_output.data()));
+  }
+}
+  #endif // SIMDUTF_SPAN
+/**
+ * Using native endianness, convert possibly broken UTF-16 string into UTF-8
+ * string with output limit.
+ *
+ * We write as many characters as possible into the output buffer.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ * + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 16-bit code units (char16_t) + * @param utf8_output the pointer to buffer that can hold conversion result + * @param utf8_len the maximum output length + * @return the number of written char; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input, + size_t length, + char *utf8_output, + size_t utf8_len) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf16_to_utf8_safe( + std::span utf16_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + // implementation note: outputspan is a forwarding ref to avoid copying + // and allow both lvalues and rvalues. std::span can be copied without + // problems, but std::vector should not, and this function should accept + // both. it will allow using an owning rvalue ref (example: passing a + // temporary std::string) as output, but the user will quickly find out + // that he has no way of getting the data out of the object in that case. + #if SIMDUTF_CPLUSPLUS23 + if consteval { + const full_result r = + scalar::utf16_to_utf8::convert_with_errors( + utf16_input.data(), utf16_input.size(), utf8_output.data(), + utf8_output.size()); + if (r.error != error_code::SUCCESS && + r.error != error_code::OUTPUT_BUFFER_TOO_SMALL) { + return 0; + } + return r.output_count; + } else + #endif + { + return convert_utf16_to_utf8_safe( + utf16_input.data(), utf16_input.size(), + reinterpret_cast(utf8_output.data()), utf8_output.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 /** - * Using native endianness, convert possibly broken UTF-16 string into Latin1 string. + * Using native endianness, convert possibly broken UTF-16 string into Latin1 + * string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1453,9 +6649,29 @@ simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t * @param input the UTF-16 string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-16 string or if it cannot be represented as Latin1 + * @return number of written code units; 0 if input is not a valid UTF-16 string + * or if it cannot be represented as Latin1 */ -simdutf_warn_unused size_t convert_utf16_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; +simdutf_warn_unused size_t convert_utf16_to_latin1( + const char16_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf16_to_latin1( + std::span utf16_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_latin1::convert( + utf16_input.data(), utf16_input.size(), latin1_output.data()); + } else + #endif + { + return convert_utf16_to_latin1( + utf16_input.data(), utf16_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-16LE string into Latin1 string. 
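// Sketch of the output-limited conversion documented above: unlike
// convert_utf16_to_utf8, the _safe variant never writes more than the
// given byte budget, which makes it usable with fixed-size buffers.
// Illustrative only; the input is native-endian char16_t data.
#include "simdutf.h"

#include <string>

size_t utf16_into_fixed_buffer(const std::u16string &utf16, char *buffer,
                               size_t capacity) {
  // Writes as many characters as fit into `capacity` bytes and returns the
  // number of bytes produced (0 if the conversion is not possible).
  return simdutf::convert_utf16_to_utf8_safe(utf16.data(), utf16.size(),
                                             buffer, capacity);
}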
@@ -1470,9 +6686,29 @@ simdutf_warn_unused size_t convert_utf16_to_latin1(const char16_t * input, size_ * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-16LE string or if it cannot be represented as Latin1 + * @return number of written code units; 0 if input is not a valid UTF-16LE + * string or if it cannot be represented as Latin1 */ -simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; +simdutf_warn_unused size_t convert_utf16le_to_latin1( + const char16_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf16le_to_latin1( + std::span utf16_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_latin1::convert( + utf16_input.data(), utf16_input.size(), latin1_output.data()); + } else + #endif + { + return convert_utf16le_to_latin1( + utf16_input.data(), utf16_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-16BE string into Latin1 string. @@ -1485,11 +6721,32 @@ simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * input, siz * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-16BE string or if it cannot be represented as Latin1 + * @return number of written code units; 0 if input is not a valid UTF-16BE + * string or if it cannot be represented as Latin1 */ -simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; - +simdutf_warn_unused size_t convert_utf16be_to_latin1( + const char16_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf16be_to_latin1( + std::span utf16_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_latin1::convert( + utf16_input.data(), utf16_input.size(), latin1_output.data()); + } else + #endif + { + return convert_utf16be_to_latin1( + utf16_input.data(), utf16_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** * Convert possibly broken UTF-16LE string into UTF-8 string. 
 *
@@ -1501,9 +6758,30 @@ simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * input, siz
 * @param input the UTF-16LE string to convert
 * @param length the length of the string in 2-byte code units (char16_t)
 * @param utf8_buffer the pointer to buffer that can hold conversion result
- * @return number of written code units; 0 if input is not a valid UTF-16LE string
+ * @return number of written code units; 0 if input is not a valid UTF-16LE
+ * string
 */
-simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
+                                                   size_t length,
+                                                   char *utf8_buffer) noexcept;
+  #if SIMDUTF_SPAN
+simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
+convert_utf16le_to_utf8(
+    std::span utf16_input,
+    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
+  #if SIMDUTF_CPLUSPLUS23
+  if consteval {
+    return scalar::utf16_to_utf8::convert(
+        utf16_input.data(), utf16_input.size(), utf8_output.data());
+  } else
+  #endif
+  {
+    return convert_utf16le_to_utf8(
+        utf16_input.data(), utf16_input.size(),
+        reinterpret_cast(utf8_output.data()));
+  }
+}
+  #endif // SIMDUTF_SPAN
 /**
 * Convert possibly broken UTF-16BE string into UTF-8 string.
@@ -1516,12 +6794,36 @@ simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_
 * @param input the UTF-16BE string to convert
 * @param length the length of the string in 2-byte code units (char16_t)
 * @param utf8_buffer the pointer to buffer that can hold conversion result
- * @return number of written code units; 0 if input is not a valid UTF-16LE string
+ * @return number of written code units; 0 if input is not a valid UTF-16BE
+ * string
 */
-simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
+                                                   size_t length,
+                                                   char *utf8_buffer) noexcept;
+  #if SIMDUTF_SPAN
+simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
+convert_utf16be_to_utf8(
+    std::span utf16_input,
+    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
+  #if SIMDUTF_CPLUSPLUS23
+  if consteval {
+    return scalar::utf16_to_utf8::convert(
+        utf16_input.data(), utf16_input.size(), utf8_output.data());
+  } else
+  #endif
+  {
+    return convert_utf16be_to_utf8(
+        utf16_input.data(), utf16_input.size(),
+        reinterpret_cast(utf8_output.data()));
+  }
+}
+  #endif // SIMDUTF_SPAN
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
+#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
 /**
- * Using native endianness, convert possibly broken UTF-16 string into Latin1 string.
+ * Using native endianness, convert possibly broken UTF-16 string into Latin1
+ * string.
 *
 * During the conversion also validation of the input string is done.
 * This function is suitable to work with inputs from untrusted sources.
@@ -1530,9 +6832,31 @@ simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_
 * @param input the UTF-16 string to convert
 * @param length the length of the string in 2-byte code units (char16_t)
 * @param latin1_buffer the pointer to buffer that can hold conversion result
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
+ * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ -simdutf_warn_unused result convert_utf16_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept; +simdutf_warn_unused result convert_utf16_to_latin1_with_errors( + const char16_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf16_to_latin1_with_errors( + std::span utf16_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_latin1::convert_with_errors( + utf16_input.data(), utf16_input.size(), latin1_output.data()); + } else + #endif + { + return convert_utf16_to_latin1_with_errors( + utf16_input.data(), utf16_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-16LE string into Latin1 string. @@ -1544,9 +6868,31 @@ simdutf_warn_unused result convert_utf16_to_latin1_with_errors(const char16_t * * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ -simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept; +simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf16le_to_latin1_with_errors( + std::span utf16_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_latin1::convert_with_errors( + utf16_input.data(), utf16_input.size(), latin1_output.data()); + } else + #endif + { + return convert_utf16le_to_latin1_with_errors( + utf16_input.data(), utf16_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-16BE string into Latin1 string. @@ -1560,13 +6906,37 @@ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 
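// Sketch for convert_utf16_to_latin1_with_errors, which fails both on
// malformed UTF-16 and on code points outside the Latin1 range. On success
// count is the number of chars written, per the comment above.
#include "simdutf.h"

#include <string>
#include <vector>

bool utf16_to_latin1_checked(const std::u16string &utf16,
                             std::vector<char> &out) {
  out.resize(utf16.size()); // Latin1 needs at most one char per input unit
  simdutf::result r = simdutf::convert_utf16_to_latin1_with_errors(
      utf16.data(), utf16.size(), out.data());
  if (r.error != simdutf::error_code::SUCCESS) {
    return false; // r.count is the error position in input code units
  }
  out.resize(r.count);
  return true;
}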
+ * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ -simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept; - +simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf16be_to_latin1_with_errors( + std::span utf16_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_latin1::convert_with_errors( + utf16_input.data(), utf16_input.size(), latin1_output.data()); + } else + #endif + { + return convert_utf16be_to_latin1_with_errors( + utf16_input.data(), utf16_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** - * Using native endianness, convert possibly broken UTF-16 string into UTF-8 string and stop on error. + * Using native endianness, convert possibly broken UTF-16 string into UTF-8 + * string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1576,9 +6946,31 @@ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * @param input the UTF-16 string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ -simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; +simdutf_warn_unused result convert_utf16_to_utf8_with_errors( + const char16_t *input, size_t length, char *utf8_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf16_to_utf8_with_errors( + std::span utf16_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf8::convert_with_errors( + utf16_input.data(), utf16_input.size(), utf8_output.data()); + } else + #endif + { + return convert_utf16_to_utf8_with_errors( + utf16_input.data(), utf16_input.size(), + reinterpret_cast(utf8_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error. 
@@ -1591,9 +6983,31 @@ simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * in * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ -simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; +simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *input, size_t length, char *utf8_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf16le_to_utf8_with_errors( + std::span utf16_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf8::convert_with_errors( + utf16_input.data(), utf16_input.size(), utf8_output.data()); + } else + #endif + { + return convert_utf16le_to_utf8_with_errors( + utf16_input.data(), utf16_input.size(), + reinterpret_cast(utf8_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error. @@ -1606,29 +7020,79 @@ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ -simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; +simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *input, size_t length, char *utf8_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf16be_to_utf8_with_errors( + std::span utf16_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf8::convert_with_errors( + utf16_input.data(), utf16_input.size(), utf8_output.data()); + } else + #endif + { + return convert_utf16be_to_utf8_with_errors( + utf16_input.data(), utf16_input.size(), + reinterpret_cast(utf8_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Using native endianness, convert valid UTF-16 string into UTF-8 string. * - * This function assumes that the input string is valid UTF-16LE. + * This function assumes that the input string is valid UTF-16. * * This function is not BOM-aware. 
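// Sketch of the explicit-endianness error-checking variants above: UTF-16BE
// bytes read from disk or the network can be converted directly, without
// byte-swapping first, once reinterpreted as char16_t units. The output is
// sized with the worst-case bound of three UTF-8 bytes per UTF-16 code unit
// (surrogate pairs need only two bytes per unit). Illustrative only.
#include "simdutf.h"

#include <vector>

bool utf16be_to_utf8_checked(const char16_t *be_units, size_t unit_count,
                             std::vector<char> &out) {
  out.resize(3 * unit_count); // safe upper bound for the UTF-8 output
  simdutf::result r = simdutf::convert_utf16be_to_utf8_with_errors(
      be_units, unit_count, out.data());
  if (r.error != simdutf::error_code::SUCCESS) {
    return false;
  }
  out.resize(r.count);
  return true;
}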
* * @param input the UTF-16 string to convert * @param length the length of the string in 2-byte code units (char16_t) - * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @param utf8_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; - +simdutf_warn_unused size_t convert_valid_utf16_to_utf8( + const char16_t *input, size_t length, char *utf8_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf16_to_utf8( + std::span valid_utf16_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf8::convert_valid( + valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data()); + } else + #endif + { + return convert_valid_utf16_to_utf8( + valid_utf16_input.data(), valid_utf16_input.size(), + reinterpret_cast(utf8_output.data())); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 /** * Using native endianness, convert UTF-16 string into Latin1 string. * - * This function assumes that the input string is valid UTF-8. + * This function assumes that the input string is valid UTF-16 and that it can + * be represented as Latin1. If you violate this assumption, the result is + * implementation defined and may include system-dependent behavior such as + * crashes. + * + * This function is for expert users only and not part of our public API. Use + * convert_utf16_to_latin1 instead. The function may be removed from the library + * in the future. * * This function is not BOM-aware. * @@ -1637,12 +7101,40 @@ simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, s * @param latin1_buffer the pointer to buffer that can hold conversion result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf16_to_latin1( + const char16_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf16_to_latin1( + std::span valid_utf16_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_latin1::convert_valid_impl( + detail::constexpr_cast_ptr(valid_utf16_input.data()), + valid_utf16_input.size(), + detail::constexpr_cast_writeptr(latin1_output.data())); + } else + #endif + { + return convert_valid_utf16_to_latin1( + valid_utf16_input.data(), valid_utf16_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-16LE string into Latin1 string. * - * This function assumes that the input string is valid UTF-16LE. + * This function assumes that the input string is valid UTF-16LE and that it can + * be represented as Latin1. If you violate this assumption, the result is + * implementation defined and may include system-dependent behavior such as + * crashes. + * + * This function is for expert users only and not part of our public API. 
Use + * convert_utf16le_to_latin1 instead. The function may be removed from the + * library in the future. * * This function is not BOM-aware. * @@ -1651,12 +7143,40 @@ simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * input, * @param latin1_buffer the pointer to buffer that can hold conversion result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( + const char16_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t +convert_valid_utf16le_to_latin1( + std::span valid_utf16_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_latin1::convert_valid_impl( + detail::constexpr_cast_ptr(valid_utf16_input.data()), + valid_utf16_input.size(), + detail::constexpr_cast_writeptr(latin1_output.data())); + } else + #endif + { + return convert_valid_utf16le_to_latin1( + valid_utf16_input.data(), valid_utf16_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-16BE string into Latin1 string. * - * This function assumes that the input string is valid UTF-16BE. + * This function assumes that the input string is valid UTF-16BE and that it can + * be represented as Latin1. If you violate this assumption, the result is + * implementation defined and may include system-dependent behavior such as + * crashes. + * + * This function is for expert users only and not part of our public API. Use + * convert_utf16be_to_latin1 instead. The function may be removed from the + * library in the future. * * This function is not BOM-aware. * @@ -1665,22 +7185,64 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * inpu * @param latin1_buffer the pointer to buffer that can hold conversion result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; - +simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( + const char16_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t +convert_valid_utf16be_to_latin1( + std::span valid_utf16_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_latin1::convert_valid_impl( + detail::constexpr_cast_ptr(valid_utf16_input.data()), + valid_utf16_input.size(), + detail::constexpr_cast_writeptr(latin1_output.data())); + } else + #endif + { + return convert_valid_utf16be_to_latin1( + valid_utf16_input.data(), valid_utf16_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** * Convert valid UTF-16LE string into UTF-8 string. * - * This function assumes that the input string is valid UTF-16LE. + * This function assumes that the input string is valid UTF-16LE * * This function is not BOM-aware. 
* * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte code units (char16_t) - * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @param utf8_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *input, size_t length, char *utf8_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf16le_to_utf8( + std::span valid_utf16_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf8::convert_valid( + valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data()); + } else + #endif + { + return convert_valid_utf16le_to_utf8( + valid_utf16_input.data(), valid_utf16_input.size(), + reinterpret_cast(utf8_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-16BE string into UTF-8 string. @@ -1691,13 +7253,36 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, * * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte code units (char16_t) - * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @param utf8_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *input, size_t length, char *utf8_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf16be_to_utf8( + std::span valid_utf16_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf8::convert_valid( + valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data()); + } else + #endif + { + return convert_valid_utf16be_to_utf8( + valid_utf16_input.data(), valid_utf16_input.size(), + reinterpret_cast(utf8_output.data())); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 /** - * Using native endianness, convert possibly broken UTF-16 string into UTF-32 string. + * Using native endianness, convert possibly broken UTF-16 string into UTF-32 + * string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. 
@@ -1707,9 +7292,28 @@ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input,
 * @param input the UTF-16 string to convert
 * @param length the length of the string in 2-byte code units (char16_t)
 * @param utf32_buffer the pointer to buffer that can hold conversion result
- * @return number of written code units; 0 if input is not a valid UTF-16LE string
+ * @return number of written code units; 0 if input is not a valid UTF-16
+ * string
 */
-simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf16_to_utf32(
+    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+  #if SIMDUTF_SPAN
+simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
+convert_utf16_to_utf32(std::span utf16_input,
+                       std::span utf32_output) noexcept {
+
+  #if SIMDUTF_CPLUSPLUS23
+  if consteval {
+    return scalar::utf16_to_utf32::convert(
+        utf16_input.data(), utf16_input.size(), utf32_output.data());
+  } else
+  #endif
+  {
+    return convert_utf16_to_utf32(utf16_input.data(), utf16_input.size(),
+                                  utf32_output.data());
+  }
+}
+  #endif // SIMDUTF_SPAN
 /**
 * Convert possibly broken UTF-16LE string into UTF-32 string.
@@ -1722,9 +7326,27 @@ simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t
 * @param input the UTF-16LE string to convert
 * @param length the length of the string in 2-byte code units (char16_t)
 * @param utf32_buffer the pointer to buffer that can hold conversion result
- * @return number of written code units; 0 if input is not a valid UTF-16LE string
+ * @return number of written code units; 0 if input is not a valid UTF-16LE
+ * string
 */
-simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf16le_to_utf32(
+    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+  #if SIMDUTF_SPAN
+simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
+convert_utf16le_to_utf32(std::span utf16_input,
+                         std::span utf32_output) noexcept {
+  #if SIMDUTF_CPLUSPLUS23
+  if consteval {
+    return scalar::utf16_to_utf32::convert(
+        utf16_input.data(), utf16_input.size(), utf32_output.data());
+  } else
+  #endif
+  {
+    return convert_utf16le_to_utf32(utf16_input.data(), utf16_input.size(),
+                                    utf32_output.data());
+  }
+}
+  #endif // SIMDUTF_SPAN
 /**
 * Convert possibly broken UTF-16BE string into UTF-32 string.
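// Sketch for convert_utf16_to_utf32: every code point produces exactly one
// char32_t from one or two char16_t, so an output sized to the input unit
// count can never be too small. Illustrative only.
#include "simdutf.h"

#include <string>
#include <vector>

std::vector<char32_t> utf16_to_utf32_example(const std::u16string &utf16) {
  std::vector<char32_t> out(utf16.size());
  size_t written = simdutf::convert_utf16_to_utf32(utf16.data(), utf16.size(),
                                                   out.data());
  out.resize(written); // zero means the input was not valid UTF-16
  return out;
}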
@@ -1737,9 +7359,27 @@ simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE + * string */ -simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; +simdutf_warn_unused size_t convert_utf16be_to_utf32( + const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf16be_to_utf32(std::span utf16_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf32::convert( + utf16_input.data(), utf16_input.size(), utf32_output.data()); + } else + #endif + { + return convert_utf16be_to_utf32(utf16_input.data(), utf16_input.size(), + utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Using native endianness, convert possibly broken UTF-16 string into @@ -1753,9 +7393,29 @@ simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size * @param input the UTF-16 string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char32_t written if + * successful. */ -simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; +simdutf_warn_unused result convert_utf16_to_utf32_with_errors( + const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf16_to_utf32_with_errors(std::span utf16_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf32::convert_with_errors( + utf16_input.data(), utf16_input.size(), utf32_output.data()); + } else + #endif + { + return convert_utf16_to_utf32_with_errors( + utf16_input.data(), utf16_input.size(), utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error. @@ -1768,9 +7428,30 @@ simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * i * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. 
+ * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char32_t written if + * successful. */ -simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; +simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf16le_to_utf32_with_errors( + std::span utf16_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf32::convert_with_errors( + utf16_input.data(), utf16_input.size(), utf32_output.data()); + } else + #endif + { + return convert_utf16le_to_utf32_with_errors( + utf16_input.data(), utf16_input.size(), utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error. @@ -1783,23 +7464,65 @@ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char32_t written if + * successful. */ -simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; +simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf16be_to_utf32_with_errors( + std::span utf16_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf32::convert_with_errors( + utf16_input.data(), utf16_input.size(), utf32_output.data()); + } else + #endif + { + return convert_utf16be_to_utf32_with_errors( + utf16_input.data(), utf16_input.size(), utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Using native endianness, convert valid UTF-16 string into UTF-32 string. * - * This function assumes that the input string is valid UTF-16 (native endianness). + * This function assumes that the input string is valid UTF-16 (native + * endianness). * * This function is not BOM-aware. 
* * @param input the UTF-16 string to convert * @param length the length of the string in 2-byte code units (char16_t) - * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @param utf32_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf16_to_utf32( + const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf16_to_utf32(std::span valid_utf16_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf32::convert_valid( + valid_utf16_input.data(), valid_utf16_input.size(), + utf32_output.data()); + } else + #endif + { + return convert_valid_utf16_to_utf32(valid_utf16_input.data(), + valid_utf16_input.size(), + utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-16LE string into UTF-32 string. @@ -1810,10 +7533,30 @@ simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, * * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte code units (char16_t) - * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @param utf32_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf16le_to_utf32( + const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf16le_to_utf32(std::span valid_utf16_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf32::convert_valid( + valid_utf16_input.data(), valid_utf16_input.size(), + utf32_output.data()); + } else + #endif + { + return convert_valid_utf16le_to_utf32(valid_utf16_input.data(), + valid_utf16_input.size(), + utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-16BE string into UTF-32 string. 
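// Editor's note: the *_with_errors variants above return a simdutf::result whose
// error field is simdutf::SUCCESS on success and whose count field is either the
// error position (in input code units) or the number of code units written. An
// illustrative sketch with a hypothetical helper name; not part of the vendored patch.
#include "simdutf.h"
#include <cstdio>

size_t decode_or_report(const char16_t *in, size_t len, char32_t *out) {
  simdutf::result r = simdutf::convert_utf16_to_utf32_with_errors(in, len, out);
  if (r.error != simdutf::SUCCESS) {
    std::fprintf(stderr, "bad UTF-16 at code unit %zu\n", r.count);
    return 0;
  }
  return r.count; // number of char32_t written
}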
@@ -1824,59 +7567,159 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input * * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte code units (char16_t) - * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @param utf32_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; - +simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( + const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf16be_to_utf32(std::span valid_utf16_input, + std::span utf32_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16_to_utf32::convert_valid( + valid_utf16_input.data(), valid_utf16_input.size(), + utf32_output.data()); + } else + #endif + { + return convert_valid_utf16be_to_utf32(valid_utf16_input.data(), + valid_utf16_input.size(), + utf32_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 -/* - * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format. - * - * This function does not validate the input. +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +/** + * Using native endianness; Compute the number of bytes that this UTF-16 + * string would require in UTF-8 format. * - * This function is not BOM-aware. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. * + * @param input the UTF-16 string to convert * @param length the length of the string in 2-byte code units (char16_t) - * @return the number of bytes required to encode the UTF-16LE string as Latin1 + * @return the number of bytes required to encode the UTF-16LE string as UTF-8 */ -simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept; - +simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +utf8_length_from_utf16(std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::utf8_length_from_utf16( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return utf8_length_from_utf16(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN /** - * Using native endianness; Compute the number of bytes that this UTF-16 - * string would require in UTF-8 format. - * - * This function does not validate the input. + * Using native endianness; compute the number of bytes that this UTF-16 + * string would require in UTF-8 format even when the UTF-16LE content contains + * mismatched surrogates that have to be replaced by the replacement character + * (0xFFFD). 
* * @param input the UTF-16 string to convert * @param length the length of the string in 2-byte code units (char16_t) - * @return the number of bytes required to encode the UTF-16LE string as UTF-8 + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) where the count is the number of bytes required to + * encode the UTF-16 string as UTF-8, and the error code is either SUCCESS or + * SURROGATE. The count is correct regardless of the error field. + * When SURROGATE is returned, it does not indicate an error in the case of this + * function: it indicates that at least one surrogate has been encountered: the + * surrogates may be matched or not (thus this function does not validate). If + * the returned error code is SUCCESS, then the input contains no surrogate, is + * in the Basic Multilingual Plane, and is necessarily valid. */ -simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept; +simdutf_warn_unused result utf8_length_from_utf16_with_replacement( + const char16_t *input, size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +utf8_length_from_utf16_with_replacement( + std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::utf8_length_from_utf16_with_replacement< + endianness::NATIVE>(valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return utf8_length_from_utf16_with_replacement(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN /** - * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. + * Compute the number of bytes that this UTF-16LE string would require in UTF-8 + * format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. * * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16LE string as UTF-8 */ -simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept; +simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t +utf8_length_from_utf16le(std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::utf8_length_from_utf16( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return utf8_length_from_utf16le(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN /** - * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format. + * Compute the number of bytes that this UTF-16BE string would require in UTF-8 + * format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. 
* * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16BE string as UTF-8 */ -simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept; +simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +utf8_length_from_utf16be(std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::utf8_length_from_utf16( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return utf8_length_from_utf16be(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 /** * Convert possibly broken UTF-32 string into UTF-8 string. * @@ -1890,7 +7733,26 @@ simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size * @param utf8_buffer the pointer to buffer that can hold conversion result * @return number of written code units; 0 if input is not a valid UTF-32 string */ -simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; +simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input, + size_t length, + char *utf8_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf32_to_utf8( + std::span utf32_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf8::convert( + utf32_input.data(), utf32_input.size(), utf8_output.data()); + } else + #endif + { + return convert_utf32_to_utf8(utf32_input.data(), utf32_input.size(), + reinterpret_cast(utf8_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-32 string into UTF-8 string and stop on error. @@ -1903,9 +7765,31 @@ simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. 
*/ -simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept; +simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *input, size_t length, char *utf8_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf32_to_utf8_with_errors( + std::span utf32_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf8::convert_with_errors( + utf32_input.data(), utf32_input.size(), utf8_output.data()); + } else + #endif + { + return convert_utf32_to_utf8_with_errors( + utf32_input.data(), utf32_input.size(), + reinterpret_cast(utf8_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-32 string into UTF-8 string. @@ -1916,13 +7800,36 @@ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * in * * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) - * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @param utf8_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *input, size_t length, char *utf8_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf32_to_utf8( + std::span valid_utf32_input, + detail::output_span_of_byte_like auto &&utf8_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf8::convert_valid( + valid_utf32_input.data(), valid_utf32_input.size(), utf8_output.data()); + } else + #endif + { + return convert_valid_utf32_to_utf8( + valid_utf32_input.data(), valid_utf32_input.size(), + reinterpret_cast(utf8_output.data())); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 /** - * Using native endianness, convert possibly broken UTF-32 string into a UTF-16 string. + * Using native endianness, convert possibly broken UTF-32 string into a UTF-16 + * string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. 
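// Editor's note: a sketch of how the SURROGATE code from
// utf8_length_from_utf16_with_replacement() (declared above) is meant to be read:
// per its documentation it flags the presence of surrogates, not an error, and the
// count is usable either way. Illustrative only (hypothetical helper name), not part
// of the vendored patch.
#include "simdutf.h"

size_t utf8_budget(const char16_t *in, size_t len) {
  simdutf::result r = simdutf::utf8_length_from_utf16_with_replacement(in, len);
  // r.count already accounts for lone surrogates re-encoded as U+FFFD.
  // r.error == simdutf::SUCCESS additionally proves the input has no surrogates
  // at all and therefore lies entirely in the Basic Multilingual Plane.
  return r.count;
}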
@@ -1934,7 +7841,24 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, s * @param utf16_buffer the pointer to buffer that can hold conversion result * @return number of written code units; 0 if input is not a valid UTF-32 string */ -simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused size_t convert_utf32_to_utf16( + const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf32_to_utf16(std::span utf32_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf16::convert( + utf32_input.data(), utf32_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf32_to_utf16(utf32_input.data(), utf32_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-32 string into UTF-16LE string. @@ -1949,8 +7873,27 @@ simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t * @param utf16_buffer the pointer to buffer that can hold conversion result * @return number of written code units; 0 if input is not a valid UTF-32 string */ -simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused size_t convert_utf32_to_utf16le( + const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf32_to_utf16le(std::span utf32_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf16::convert( + utf32_input.data(), utf32_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf32_to_utf16le(utf32_input.data(), utf32_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 /** * Convert possibly broken UTF-32 string into Latin1 string. 
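// Editor's note: an illustrative two-pass UTF-32 -> UTF-16 conversion using the
// declarations above together with utf16_length_from_utf32() (declared later in
// this header). Hypothetical helper name; not part of the vendored patch.
#include "simdutf.h"
#include <string>

std::u16string utf32_to_utf16_checked(const std::u32string &in) {
  std::u16string out(simdutf::utf16_length_from_utf32(in.data(), in.size()), u'\0');
  size_t written = simdutf::convert_utf32_to_utf16(in.data(), in.size(), out.data());
  out.resize(written); // written is 0 if the input was not valid UTF-32
  return out;
}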
* @@ -1962,10 +7905,29 @@ simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-32 string or if it cannot be represented as Latin1 + * @return number of written code units; 0 if input is not a valid UTF-32 string + * or if it cannot be represented as Latin1 */ -simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept; - +simdutf_warn_unused size_t convert_utf32_to_latin1( + const char32_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf32_to_latin1( + std::span utf32_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_latin1::convert( + utf32_input.data(), utf32_input.size(), latin1_output.data()); + } else + #endif + { + return convert_utf32_to_latin1( + utf32_input.data(), utf32_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-32 string into Latin1 string and stop on error. @@ -1979,24 +7941,107 @@ simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_ * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ -simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) noexcept; +simdutf_warn_unused result convert_utf32_to_latin1_with_errors( + const char32_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf32_to_latin1_with_errors( + std::span utf32_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_latin1::convert_with_errors( + utf32_input.data(), utf32_input.size(), latin1_output.data()); + } else + #endif + { + return convert_utf32_to_latin1_with_errors( + utf32_input.data(), utf32_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-32 string into Latin1 string. * - * This function assumes that the input string is valid UTF-32. + * This function assumes that the input string is valid UTF-32 and that it can + * be represented as Latin1. If you violate this assumption, the result is + * implementation defined and may include system-dependent behavior such as + * crashes. + * + * This function is for expert users only and not part of our public API. Use + * convert_utf32_to_latin1 instead. 
The function may be removed from the library + * in the future. * * This function is not BOM-aware. * * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) - * @param latin1_buffer the pointer to buffer that can hold the conversion result + * @param latin1_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf32_to_latin1( + const char32_t *input, size_t length, char *latin1_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t +convert_valid_utf32_to_latin1( + std::span valid_utf32_input, + detail::output_span_of_byte_like auto &&latin1_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_latin1::convert_valid( + detail::constexpr_cast_ptr(valid_utf32_input.data()), + valid_utf32_input.size(), + detail::constexpr_cast_writeptr(latin1_output.data())); + } + #endif + { + return convert_valid_utf32_to_latin1( + valid_utf32_input.data(), valid_utf32_input.size(), + reinterpret_cast(latin1_output.data())); + } +} + #endif // SIMDUTF_SPAN + +/** + * Compute the number of bytes that this UTF-32 string would require in Latin1 + * format. + * + * This function does not validate the input. It is acceptable to pass invalid + * UTF-32 strings but in such cases the result is implementation defined. + * + * This function is not BOM-aware. + * + * @param length the length of the string in 4-byte code units (char32_t) + * @return the number of bytes required to encode the UTF-32 string as Latin1 + */ +simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 size_t +latin1_length_from_utf32(size_t length) noexcept { + return length; +} + +/** + * Compute the number of bytes that this Latin1 string would require in UTF-32 + * format. + * + * @param length the length of the string in Latin1 code units (char) + * @return the length of the string in 4-byte code units (char32_t) required to + * encode the Latin1 string as UTF-32 + */ +simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 size_t +utf32_length_from_latin1(size_t length) noexcept { + return length; +} +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 /** * Convert possibly broken UTF-32 string into UTF-16BE string. 
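// Editor's note: because Latin1 is a single-byte encoding, the sizing helpers
// defined above are identities (latin1_length_from_utf32(n) == n), so the checked
// UTF-32 -> Latin1 conversion needs no separate measuring pass. A sketch with a
// hypothetical helper name; illustrative only, not part of the vendored patch.
#include "simdutf.h"
#include <string>

bool utf32_to_latin1_checked(const std::u32string &in, std::string &out) {
  out.resize(simdutf::latin1_length_from_utf32(in.size())); // == in.size()
  simdutf::result r = simdutf::convert_utf32_to_latin1_with_errors(
      in.data(), in.size(), out.data());
  if (r.error != simdutf::SUCCESS) {
    return false; // invalid UTF-32, or a code point above U+00FF, at r.count
  }
  out.resize(r.count);
  return true;
}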
* @@ -2010,7 +8055,24 @@ simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * input, * @param utf16_buffer the pointer to buffer that can hold conversion result * @return number of written code units; 0 if input is not a valid UTF-32 string */ -simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused size_t convert_utf32_to_utf16be( + const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_utf32_to_utf16be(std::span utf32_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf16::convert( + utf32_input.data(), utf32_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf32_to_utf16be(utf32_input.data(), utf32_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Using native endianness, convert possibly broken UTF-32 string into UTF-16 @@ -2024,9 +8086,29 @@ simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char16_t written if + * successful. */ -simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused result convert_utf32_to_utf16_with_errors( + const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf32_to_utf16_with_errors(std::span utf32_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf16::convert_with_errors( + utf32_input.data(), utf32_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf32_to_utf16_with_errors( + utf32_input.data(), utf32_input.size(), utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error. @@ -2039,9 +8121,30 @@ simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * i * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char16_t written if + * successful. 
*/ -simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf32_to_utf16le_with_errors( + std::span utf32_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf16::convert_with_errors( + utf32_input.data(), utf32_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf32_to_utf16le_with_errors( + utf32_input.data(), utf32_input.size(), utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error. @@ -2054,9 +8157,30 @@ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char16_t written if + * successful. */ -simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +convert_utf32_to_utf16be_with_errors( + std::span utf32_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf16::convert_with_errors( + utf32_input.data(), utf32_input.size(), utf16_output.data()); + } else + #endif + { + return convert_utf32_to_utf16be_with_errors( + utf32_input.data(), utf32_input.size(), utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Using native endianness, convert valid UTF-32 string into a UTF-16 string. 
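// Editor's note: the explicit-endianness *_with_errors variants above follow the
// same simdutf::result convention as the native-endianness ones. A minimal sketch
// for the little-endian case (hypothetical helper name; not part of the vendored patch).
#include "simdutf.h"
#include <vector>

bool utf32_to_utf16le(const char32_t *in, size_t len,
                      std::vector<char16_t> &out) {
  out.resize(simdutf::utf16_length_from_utf32(in, len));
  simdutf::result r = simdutf::convert_utf32_to_utf16le_with_errors(in, len, out.data());
  if (r.error != simdutf::SUCCESS)
    return false;        // r.count is the offending input position
  out.resize(r.count);   // r.count is the number of char16_t written
  return true;
}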
@@ -2067,10 +8191,31 @@ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * * * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) - * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @param utf16_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf32_to_utf16( + const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf32_to_utf16(std::span valid_utf32_input, + std::span utf16_output) noexcept { + + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf16::convert_valid( + valid_utf32_input.data(), valid_utf32_input.size(), + utf16_output.data()); + } else + #endif + { + return convert_valid_utf32_to_utf16(valid_utf32_input.data(), + valid_utf32_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-32 string into UTF-16LE string. @@ -2081,10 +8226,30 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, * * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) - * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @param utf16_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( + const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf32_to_utf16le(std::span valid_utf32_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf16::convert_valid( + valid_utf32_input.data(), valid_utf32_input.size(), + utf16_output.data()); + } else + #endif + { + return convert_valid_utf32_to_utf16le(valid_utf32_input.data(), + valid_utf32_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN /** * Convert valid UTF-32 string into UTF-16BE string. 
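// Editor's note: the convert_valid_* functions above trade safety for speed; a
// common pattern is to validate once, then use the unchecked kernel. Illustrative
// sketch only, assuming simdutf::validate_utf32() from the validation section of
// this header (not shown in this excerpt); the helper name is hypothetical.
#include "simdutf.h"

size_t fast_utf32_to_utf16le(const char32_t *in, size_t len, char16_t *out) {
  if (!simdutf::validate_utf32(in, len)) {
    return 0; // report or fall back; the valid-only kernel must never see this
  }
  return simdutf::convert_valid_utf32_to_utf16le(in, len, out);
}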
@@ -2095,14 +8260,36 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input * * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) - * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @param utf16_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ -simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; +simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( + const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +convert_valid_utf32_to_utf16be(std::span valid_utf32_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32_to_utf16::convert_valid( + valid_utf32_input.data(), valid_utf32_input.size(), + utf16_output.data()); + } else + #endif + { + return convert_valid_utf32_to_utf16be(valid_utf32_input.data(), + valid_utf32_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 /** - * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or - * from UTF-16BE to UTF-16LE. + * Change the endianness of the input. Can be used to go from UTF-16LE to + * UTF-16BE or from UTF-16BE to UTF-16LE. * * This function does not validate the input. * @@ -2110,31 +8297,89 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input * * @param input the UTF-16 string to process * @param length the length of the string in 2-byte code units (char16_t) - * @param output the pointer to buffer that can hold the conversion result + * @param output the pointer to a buffer that can hold the conversion + * result */ -void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept; +void change_endianness_utf16(const char16_t *input, size_t length, + char16_t *output) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_constexpr23 void +change_endianness_utf16(std::span utf16_input, + std::span utf16_output) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::change_endianness_utf16( + utf16_input.data(), utf16_input.size(), utf16_output.data()); + } else + #endif + { + return change_endianness_utf16(utf16_input.data(), utf16_input.size(), + utf16_output.data()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 /** - * Compute the number of bytes that this UTF-32 string would require in UTF-8 format. + * Compute the number of bytes that this UTF-32 string would require in UTF-8 + * format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-32 strings but in such cases the result is implementation defined. 
* * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) * @return the number of bytes required to encode the UTF-32 string as UTF-8 */ -simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept; +simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +utf8_length_from_utf32(std::span valid_utf32_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32::utf8_length_from_utf32(valid_utf32_input.data(), + valid_utf32_input.size()); + } else + #endif + { + return utf8_length_from_utf32(valid_utf32_input.data(), + valid_utf32_input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 /** - * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format. + * Compute the number of two-byte code units that this UTF-32 string would + * require in UTF-16 format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-32 strings but in such cases the result is implementation defined. * * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte code units (char32_t) * @return the number of bytes required to encode the UTF-32 string as UTF-16 */ -simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept; +simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +utf16_length_from_utf32(std::span valid_utf32_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf32::utf16_length_from_utf32(valid_utf32_input.data(), + valid_utf32_input.size()); + } else + #endif + { + return utf16_length_from_utf32(valid_utf32_input.data(), + valid_utf32_input.size()); + } +} + #endif // SIMDUTF_SPAN /** * Using native endianness; Compute the number of bytes that this UTF-16 @@ -2142,7 +8387,8 @@ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_ * * This function is equivalent to count_utf16. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. 
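// Editor's note: a sketch of change_endianness_utf16() declared above; it
// byte-swaps every code unit without validating, which makes it the right tool
// for UTF-16LE <-> UTF-16BE fix-ups. Illustrative only; a distinct output buffer
// is used here because the declaration does not promise in-place operation.
#include "simdutf.h"
#include <vector>

std::vector<char16_t> swap_utf16(const char16_t *in, size_t len) {
  std::vector<char16_t> out(len); // same code-unit count, bytes swapped
  simdutf::change_endianness_utf16(in, len, out.data());
  return out;
}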
* @@ -2150,14 +8396,32 @@ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_ * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16LE string as UTF-32 */ -simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept; +simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +utf32_length_from_utf16(std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::utf32_length_from_utf16( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return utf32_length_from_utf16(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN /** - * Compute the number of bytes that this UTF-16LE string would require in UTF-32 format. + * Compute the number of bytes that this UTF-16LE string would require in UTF-32 + * format. * * This function is equivalent to count_utf16le. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. * @@ -2165,14 +8429,33 @@ simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_ * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16LE string as UTF-32 */ -simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept; +simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +utf32_length_from_utf16le( + std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::utf32_length_from_utf16( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return utf32_length_from_utf16le(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN /** - * Compute the number of bytes that this UTF-16BE string would require in UTF-32 format. + * Compute the number of bytes that this UTF-16BE string would require in UTF-32 + * format. * * This function is equivalent to count_utf16be. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. 
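// Editor's note: as the documentation above states, utf32_length_from_utf16*()
// is equivalent to count_utf16*(): every UTF-16 code point, surrogate pair or
// not, becomes exactly one char32_t. An illustrative check (not part of the
// vendored patch; hypothetical helper name):
#include "simdutf.h"
#include <cassert>

void check_equivalence(const char16_t *in, size_t len) {
  // Both count code points; the two names differ only in stated intent.
  assert(simdutf::utf32_length_from_utf16(in, len) ==
         simdutf::count_utf16(in, len));
}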
* @@ -2180,13 +8463,34 @@ simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, siz * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16BE string as UTF-32 */ -simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept; +simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +utf32_length_from_utf16be( + std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::utf32_length_from_utf16( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return utf32_length_from_utf16be(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 /** * Count the number of code points (characters) in the string assuming that * it is valid. * - * This function assumes that the input string is valid UTF-16 (native endianness). + * This function assumes that the input string is valid UTF-16 (native + * endianness). It is acceptable to pass invalid UTF-16 strings but in such + * cases the result is implementation defined. * * This function is not BOM-aware. * @@ -2194,13 +8498,30 @@ simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, siz * @param length the length of the string in 2-byte code units (char16_t) * @return number of code points */ -simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept; +simdutf_warn_unused size_t count_utf16(const char16_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +count_utf16(std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::count_code_points( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return count_utf16(valid_utf16_input.data(), valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN /** * Count the number of code points (characters) in the string assuming that * it is valid. * * This function assumes that the input string is valid UTF-16LE. + * It is acceptable to pass invalid UTF-16 strings but in such cases + * the result is implementation defined. * * This function is not BOM-aware. * @@ -2208,13 +8529,30 @@ simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) no * @param length the length of the string in 2-byte code units (char16_t) * @return number of code points */ -simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept; +simdutf_warn_unused size_t count_utf16le(const char16_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +count_utf16le(std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::count_code_points( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return count_utf16le(valid_utf16_input.data(), valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN /** * Count the number of code points (characters) in the string assuming that * it is valid. * * This function assumes that the input string is valid UTF-16BE. 
+ * It is acceptable to pass invalid UTF-16 strings but in such cases + * the result is implementation defined. * * This function is not BOM-aware. * @@ -2222,240 +8560,2762 @@ simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) * @param length the length of the string in 2-byte code units (char16_t) * @return number of code points */ -simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept; +simdutf_warn_unused size_t count_utf16be(const char16_t *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +count_utf16be(std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::count_code_points( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return count_utf16be(valid_utf16_input.data(), valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 /** * Count the number of code points (characters) in the string assuming that * it is valid. * * This function assumes that the input string is valid UTF-8. + * It is acceptable to pass invalid UTF-8 strings but in such cases + * the result is implementation defined. * * @param input the UTF-8 string to process * @param length the length of the string in bytes * @return number of code points */ -simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept; +simdutf_warn_unused size_t count_utf8(const char *input, + size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t count_utf8( + const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8::count_code_points(valid_utf8_input.data(), + valid_utf8_input.size()); + } else + #endif + { + return count_utf8(reinterpret_cast(valid_utf8_input.data()), + valid_utf8_input.size()); + } +} + #endif // SIMDUTF_SPAN /** * Given a valid UTF-8 string having a possibly truncated last character, - * this function checks the end of string. If the last character is truncated (or partial), - * then it returns a shorter length (shorter by 1 to 3 bytes) so that the short UTF-8 - * strings only contain complete characters. If there is no truncated character, - * the original length is returned. + * this function checks the end of string. If the last character is truncated + * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so + * that the short UTF-8 strings only contain complete characters. If there is no + * truncated character, the original length is returned. * - * This function assumes that the input string is valid UTF-8, but possibly truncated. + * This function assumes that the input string is valid UTF-8, but possibly + * truncated. 
* * @param input the UTF-8 string to process * @param length the length of the string in bytes * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes */ simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length); + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +trim_partial_utf8( + const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf8::trim_partial_utf8(valid_utf8_input.data(), + valid_utf8_input.size()); + } else + #endif + { + return trim_partial_utf8( + reinterpret_cast(valid_utf8_input.data()), + valid_utf8_input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_UTF16 /** * Given a valid UTF-16BE string having a possibly truncated last character, - * this function checks the end of string. If the last character is truncated (or partial), - * then it returns a shorter length (shorter by 1 unit) so that the short UTF-16BE - * strings only contain complete characters. If there is no truncated character, - * the original length is returned. + * this function checks the end of string. If the last character is truncated + * (or partial), then it returns a shorter length (shorter by 1 unit) so that + * the short UTF-16BE strings only contain complete characters. If there is no + * truncated character, the original length is returned. * - * This function assumes that the input string is valid UTF-16BE, but possibly truncated. + * This function assumes that the input string is valid UTF-16BE, but possibly + * truncated. * * @param input the UTF-16BE string to process * @param length the length of the string in bytes * @return the length of the string in bytes, possibly shorter by 1 unit */ -simdutf_warn_unused size_t trim_partial_utf16be(const char16_t* input, size_t length); +simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input, + size_t length); + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +trim_partial_utf16be(std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::trim_partial_utf16( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return trim_partial_utf16be(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN /** * Given a valid UTF-16LE string having a possibly truncated last character, - * this function checks the end of string. If the last character is truncated (or partial), - * then it returns a shorter length (shorter by 1 unit) so that the short UTF-16LE - * strings only contain complete characters. If there is no truncated character, - * the original length is returned. + * this function checks the end of string. If the last character is truncated + * (or partial), then it returns a shorter length (shorter by 1 unit) so that + * the short UTF-16LE strings only contain complete characters. If there is no + * truncated character, the original length is returned. * - * This function assumes that the input string is valid UTF-16LE, but possibly truncated. + * This function assumes that the input string is valid UTF-16LE, but possibly + * truncated. 
* * @param input the UTF-16LE string to process * @param length the length of the string in bytes * @return the length of the string in unit, possibly shorter by 1 unit */ -simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t length); - +simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input, + size_t length); + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +trim_partial_utf16le(std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::trim_partial_utf16( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return trim_partial_utf16le(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN /** * Given a valid UTF-16 string having a possibly truncated last character, - * this function checks the end of string. If the last character is truncated (or partial), - * then it returns a shorter length (shorter by 1 unit) so that the short UTF-16 - * strings only contain complete characters. If there is no truncated character, - * the original length is returned. + * this function checks the end of string. If the last character is truncated + * (or partial), then it returns a shorter length (shorter by 1 unit) so that + * the short UTF-16 strings only contain complete characters. If there is no + * truncated character, the original length is returned. * - * This function assumes that the input string is valid UTF-16, but possibly truncated. - * We use the native endianness. + * This function assumes that the input string is valid UTF-16, but possibly + * truncated. We use the native endianness. * * @param input the UTF-16 string to process * @param length the length of the string in bytes * @return the length of the string in unit, possibly shorter by 1 unit */ -simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length); +simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input, + size_t length); + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +trim_partial_utf16(std::span valid_utf16_input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::utf16::trim_partial_utf16( + valid_utf16_input.data(), valid_utf16_input.size()); + } else + #endif + { + return trim_partial_utf16(valid_utf16_input.data(), + valid_utf16_input.size()); + } +} + #endif // SIMDUTF_SPAN +#endif // SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_BASE64 || SIMDUTF_FEATURE_UTF16 || \ + SIMDUTF_FEATURE_DETECT_ENCODING + #ifndef SIMDUTF_NEED_TRAILING_ZEROES + #define SIMDUTF_NEED_TRAILING_ZEROES 1 + #endif +#endif // SIMDUTF_FEATURE_BASE64 || SIMDUTF_FEATURE_UTF16 || + // SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_BASE64 // base64_options are used to specify the base64 encoding options. -using base64_options = uint64_t; -enum : base64_options { - base64_default = 0, /* standard base64 format */ - base64_url = 1 /* base64url format*/ +// ASCII spaces are ' ', '\t', '\n', '\r', '\f' +// garbage characters are characters that are not part of the base64 alphabet +// nor ASCII spaces. 
+constexpr uint64_t base64_reverse_padding = + 2; /* modifier for base64_default and base64_url */ +enum base64_options : uint64_t { + base64_default = 0, /* standard base64 format (with padding) */ + base64_url = 1, /* base64url format (no padding) */ + base64_default_no_padding = + base64_default | + base64_reverse_padding, /* standard base64 format without padding */ + base64_url_with_padding = + base64_url | base64_reverse_padding, /* base64url with padding */ + base64_default_accept_garbage = + 4, /* standard base64 format accepting garbage characters, the input stops + with the first '=' if any */ + base64_url_accept_garbage = + 5, /* base64url format accepting garbage characters, the input stops with + the first '=' if any */ + base64_default_or_url = + 8, /* standard/base64url hybrid format (only meaningful for decoding!) */ + base64_default_or_url_accept_garbage = + 12, /* standard/base64url hybrid format accepting garbage characters + (only meaningful for decoding!), the input stops with the first '=' + if any */ +}; + +// last_chunk_handling_options are used to specify the handling of the last +// chunk in base64 decoding. +// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 +enum last_chunk_handling_options : uint64_t { + loose = 0, /* standard base64 format, decode partial final chunk */ + strict = 1, /* error when the last chunk is partial, 2 or 3 chars, and + unpadded, or non-zero bit padding */ + stop_before_partial = + 2, /* if the last chunk is partial, ignore it (no error) */ + only_full_chunks = + 3 /* only decode full blocks (4 base64 characters, no padding) */ +}; + +inline simdutf_constexpr23 bool +is_partial(last_chunk_handling_options options) { + return (options == stop_before_partial) || (options == only_full_chunks); +} + +namespace detail { +simdutf_warn_unused const char *find(const char *start, const char *end, + char character) noexcept; +simdutf_warn_unused const char16_t * +find(const char16_t *start, const char16_t *end, char16_t character) noexcept; +} // namespace detail + +/** + * Find the first occurrence of a character in a string. If the character is + * not found, return a pointer to the end of the string. + * @param start the start of the string + * @param end the end of the string + * @param character the character to find + * @return a pointer to the first occurrence of the character in the string, + * or a pointer to the end of the string if the character is not found. + * + */ +simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 const char * +find(const char *start, const char *end, char character) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + for (; start != end; ++start) + if (*start == character) + return start; + return end; + } else + #endif + { + return detail::find(start, end, character); + } +} +simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 const char16_t * +find(const char16_t *start, const char16_t *end, char16_t character) noexcept { + // implementation note: this is repeated instead of a template, to ensure + // the api is still a function and compiles without concepts + #if SIMDUTF_CPLUSPLUS23 + if consteval { + for (; start != end; ++start) + if (*start == character) + return start; + return end; + } else + #endif + { + return detail::find(start, end, character); + } +} +} + // We include base64_tables once. 
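For readers of the API above: a minimal sketch of how the trim_partial_* functions are meant to be used when consuming UTF-8 in fixed-size chunks. The function process_chunk and the carry/carry_len names are illustrative only, not part of simdutf; only trim_partial_utf8 is the library's API.

    #include "simdutf.h"
    #include <cstddef>
    #include <cstring>

    // `chunk` holds the next `chunk_len` bytes of a stream that is valid UTF-8
    // overall, but whose final code point may be split across the chunk
    // boundary. Returns how many leading bytes are safe to process now; the
    // 0 to 3 trailing bytes of a split code point are saved in `carry` to be
    // prepended to the next chunk.
    size_t process_chunk(const char *chunk, size_t chunk_len, char *carry,
                         size_t *carry_len) {
      size_t complete = simdutf::trim_partial_utf8(chunk, chunk_len);
      // ... validate or convert the first `complete` bytes here ...
      *carry_len = chunk_len - complete; // at most 3 bytes for UTF-8
      std::memcpy(carry, chunk + complete, *carry_len);
      return complete;
    }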
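The option enums compose by OR-ing: base64_default_no_padding is base64_default | base64_reverse_padding and base64_url_with_padding is base64_url | base64_reverse_padding, so the padding expectation flips relative to each alphabet's default. A short decode sketch follows; it assumes simdutf's base64_to_binary and maximal_binary_length_from_base64 entry points, which this header declares outside this hunk, so treat those two calls as assumptions here.

    #include "simdutf.h"
    #include <string>
    #include <vector>

    // Decode unpadded base64url, rejecting garbage characters; switching to a
    // padded or garbage-tolerant variant only changes the options value.
    std::vector<char> decode_base64url(const std::string &src) {
      std::vector<char> out(
          simdutf::maximal_binary_length_from_base64(src.data(), src.size()));
      simdutf::result r = simdutf::base64_to_binary(
          src.data(), src.size(), out.data(), simdutf::base64_url);
      if (r.error != simdutf::error_code::SUCCESS) {
        out.clear(); // invalid character, bad padding, ...
        return out;
      }
      out.resize(r.count); // r.count is the number of decoded bytes
      return out;
    }

The new find() helper above behaves like std::find restricted to char and char16_t ranges: simdutf::find(p, p + n, ',') returns p + n when ',' does not occur.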
+/* begin file include/simdutf/base64_tables.h */ +#ifndef SIMDUTF_BASE64_TABLES_H +#define SIMDUTF_BASE64_TABLES_H +#include + +namespace simdutf { +namespace { +namespace tables { +namespace base64 { +namespace base64_default { + +constexpr char e0[256] = { + 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D', + 'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H', + 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L', + 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', + 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S', + 'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W', + 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a', + 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', + 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h', + 'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l', + 'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p', + 'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's', + 't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w', + 'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0', + '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4', + '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7', + '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', '+', '+', '/', '/', '/', + '/'}; + +constexpr char e1[256] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', + 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', + 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', + '/'}; + +constexpr char e2[256] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', + 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', 
'1', '2', '3', + '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', + 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', + '/'}; + +constexpr uint32_t d0[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc, + 0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4, + 0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018, + 0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030, + 0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048, + 0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060, + 0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078, + 0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090, + 0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8, + 0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0, + 0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 
0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; + +constexpr uint32_t d1[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003, + 0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003, + 0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000, + 0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000, + 0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001, + 0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001, + 0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001, + 0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002, + 0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002, + 0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003, + 0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; + 
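// A reading aid inferred from the table values (not upstream commentary):
// d0..d3 in this namespace are per-position, pre-shifted decode tables. Each
// maps the i-th character of a 4-character base64 block to its 6-bit value
// already shifted into place in a little-endian uint32_t, so a block decodes
// as
//   x = d0[c0] | d1[c1] | d2[c2] | d3[c3];
// Invalid characters map to 0x01ffffff, which keeps bit 24 set through the
// ORs, so a single `x >= 0x01000000` test flags an error anywhere in the
// block; otherwise the three decoded bytes are the low bytes of x, x >> 8 and
// x >> 16. For example, "TWFu" ORs to 0x006e614d, i.e. the bytes 'M' 'a' 'n'.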
+constexpr uint32_t d2[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00, + 0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00, + 0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100, + 0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300, + 0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400, + 0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600, + 0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700, + 0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900, + 0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00, + 0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00, + 0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; + +constexpr uint32_t d3[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000, + 0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000, + 0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000, + 0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000, + 0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000, + 0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000, + 0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000, + 0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000, + 0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000, + 0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000, + 0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; +} // namespace base64_default + +namespace base64_url { + +constexpr char e0[256] = { + 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D', + 'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H', + 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L', + 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', + 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S', + 'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W', + 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a', + 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', + 'e', 'e', 
'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h', + 'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l', + 'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p', + 'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's', + 't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w', + 'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0', + '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4', + '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7', + '8', '8', '8', '8', '9', '9', '9', '9', '-', '-', '-', '-', '_', '_', '_', + '_'}; + +constexpr char e1[256] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', + 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', + 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', + '_'}; + +constexpr char e2[256] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', + 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', + 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', + '_'}; + +constexpr uint32_t d0[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 
0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, + 0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4, + 0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018, + 0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030, + 0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048, + 0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060, + 0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc, + 0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078, + 0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090, + 0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8, + 0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0, + 0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; +constexpr uint32_t d1[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 
0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, + 0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003, + 0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000, + 0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000, + 0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001, + 0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001, + 0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003, + 0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001, + 0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002, + 0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002, + 0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003, + 0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; +constexpr uint32_t d2[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, + 0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00, + 0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00400000, 0x00800000, 0x00c00000, 
0x00000100, 0x00400100, 0x00800100, + 0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300, + 0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400, + 0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600, + 0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00, + 0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700, + 0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900, + 0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00, + 0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00, + 0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; +constexpr uint32_t d3[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, + 0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000, + 0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000, + 0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000, + 0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000, + 0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000, + 0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000, + 0x01ffffff, 0x001a0000, 
0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000, + 0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000, + 0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000, + 0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000, + 0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; +} // namespace base64_url + +namespace base64_default_or_url { +constexpr uint32_t d0[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x000000f8, 0x01ffffff, 0x000000f8, 0x01ffffff, 0x000000fc, + 0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4, + 0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018, + 0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030, + 0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048, + 0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060, + 0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc, + 0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078, + 0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090, + 0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8, + 0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0, + 0x000000c4, 0x000000c8, 
0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; +constexpr uint32_t d1[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x0000e003, 0x01ffffff, 0x0000e003, 0x01ffffff, 0x0000f003, + 0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003, + 0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000, + 0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000, + 0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001, + 0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001, + 0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003, + 0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001, + 0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002, + 0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002, + 0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003, + 0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; +constexpr uint32_t d2[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x00800f00, 0x01ffffff, 0x00800f00, 0x01ffffff, 0x00c00f00, + 0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00, + 0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100, + 0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300, + 0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400, + 0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600, + 0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00, + 0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700, + 0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900, + 0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00, + 0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00, + 0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; +constexpr uint32_t d3[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x003e0000, 0x01ffffff, 0x003e0000, 0x01ffffff, 0x003f0000, + 0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000, + 0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000, + 0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000, + 0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000, + 0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000, + 0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000, + 0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000, + 0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000, + 0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000, + 0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000, + 0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 
0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; +} // namespace base64_default_or_url +constexpr uint64_t thintable_epi8[256] = { + 0x0706050403020100, 0x0007060504030201, 0x0007060504030200, + 0x0000070605040302, 0x0007060504030100, 0x0000070605040301, + 0x0000070605040300, 0x0000000706050403, 0x0007060504020100, + 0x0000070605040201, 0x0000070605040200, 0x0000000706050402, + 0x0000070605040100, 0x0000000706050401, 0x0000000706050400, + 0x0000000007060504, 0x0007060503020100, 0x0000070605030201, + 0x0000070605030200, 0x0000000706050302, 0x0000070605030100, + 0x0000000706050301, 0x0000000706050300, 0x0000000007060503, + 0x0000070605020100, 0x0000000706050201, 0x0000000706050200, + 0x0000000007060502, 0x0000000706050100, 0x0000000007060501, + 0x0000000007060500, 0x0000000000070605, 0x0007060403020100, + 0x0000070604030201, 0x0000070604030200, 0x0000000706040302, + 0x0000070604030100, 0x0000000706040301, 0x0000000706040300, + 0x0000000007060403, 0x0000070604020100, 0x0000000706040201, + 0x0000000706040200, 0x0000000007060402, 0x0000000706040100, + 0x0000000007060401, 0x0000000007060400, 0x0000000000070604, + 0x0000070603020100, 0x0000000706030201, 0x0000000706030200, + 0x0000000007060302, 0x0000000706030100, 0x0000000007060301, + 0x0000000007060300, 0x0000000000070603, 0x0000000706020100, + 0x0000000007060201, 0x0000000007060200, 0x0000000000070602, + 0x0000000007060100, 0x0000000000070601, 0x0000000000070600, + 0x0000000000000706, 0x0007050403020100, 0x0000070504030201, + 0x0000070504030200, 0x0000000705040302, 0x0000070504030100, + 0x0000000705040301, 0x0000000705040300, 0x0000000007050403, + 0x0000070504020100, 0x0000000705040201, 0x0000000705040200, + 0x0000000007050402, 0x0000000705040100, 0x0000000007050401, + 0x0000000007050400, 0x0000000000070504, 0x0000070503020100, + 0x0000000705030201, 0x0000000705030200, 0x0000000007050302, + 0x0000000705030100, 0x0000000007050301, 0x0000000007050300, + 0x0000000000070503, 0x0000000705020100, 0x0000000007050201, + 0x0000000007050200, 0x0000000000070502, 0x0000000007050100, + 0x0000000000070501, 0x0000000000070500, 0x0000000000000705, + 0x0000070403020100, 0x0000000704030201, 0x0000000704030200, + 0x0000000007040302, 0x0000000704030100, 0x0000000007040301, + 0x0000000007040300, 0x0000000000070403, 0x0000000704020100, + 0x0000000007040201, 0x0000000007040200, 0x0000000000070402, + 0x0000000007040100, 0x0000000000070401, 0x0000000000070400, + 0x0000000000000704, 0x0000000703020100, 0x0000000007030201, + 0x0000000007030200, 0x0000000000070302, 0x0000000007030100, + 0x0000000000070301, 0x0000000000070300, 0x0000000000000703, + 0x0000000007020100, 0x0000000000070201, 0x0000000000070200, + 0x0000000000000702, 0x0000000000070100, 0x0000000000000701, + 0x0000000000000700, 0x0000000000000007, 0x0006050403020100, + 0x0000060504030201, 0x0000060504030200, 0x0000000605040302, + 0x0000060504030100, 0x0000000605040301, 0x0000000605040300, + 0x0000000006050403, 0x0000060504020100, 0x0000000605040201, + 0x0000000605040200, 
0x0000000006050402, 0x0000000605040100, + 0x0000000006050401, 0x0000000006050400, 0x0000000000060504, + 0x0000060503020100, 0x0000000605030201, 0x0000000605030200, + 0x0000000006050302, 0x0000000605030100, 0x0000000006050301, + 0x0000000006050300, 0x0000000000060503, 0x0000000605020100, + 0x0000000006050201, 0x0000000006050200, 0x0000000000060502, + 0x0000000006050100, 0x0000000000060501, 0x0000000000060500, + 0x0000000000000605, 0x0000060403020100, 0x0000000604030201, + 0x0000000604030200, 0x0000000006040302, 0x0000000604030100, + 0x0000000006040301, 0x0000000006040300, 0x0000000000060403, + 0x0000000604020100, 0x0000000006040201, 0x0000000006040200, + 0x0000000000060402, 0x0000000006040100, 0x0000000000060401, + 0x0000000000060400, 0x0000000000000604, 0x0000000603020100, + 0x0000000006030201, 0x0000000006030200, 0x0000000000060302, + 0x0000000006030100, 0x0000000000060301, 0x0000000000060300, + 0x0000000000000603, 0x0000000006020100, 0x0000000000060201, + 0x0000000000060200, 0x0000000000000602, 0x0000000000060100, + 0x0000000000000601, 0x0000000000000600, 0x0000000000000006, + 0x0000050403020100, 0x0000000504030201, 0x0000000504030200, + 0x0000000005040302, 0x0000000504030100, 0x0000000005040301, + 0x0000000005040300, 0x0000000000050403, 0x0000000504020100, + 0x0000000005040201, 0x0000000005040200, 0x0000000000050402, + 0x0000000005040100, 0x0000000000050401, 0x0000000000050400, + 0x0000000000000504, 0x0000000503020100, 0x0000000005030201, + 0x0000000005030200, 0x0000000000050302, 0x0000000005030100, + 0x0000000000050301, 0x0000000000050300, 0x0000000000000503, + 0x0000000005020100, 0x0000000000050201, 0x0000000000050200, + 0x0000000000000502, 0x0000000000050100, 0x0000000000000501, + 0x0000000000000500, 0x0000000000000005, 0x0000000403020100, + 0x0000000004030201, 0x0000000004030200, 0x0000000000040302, + 0x0000000004030100, 0x0000000000040301, 0x0000000000040300, + 0x0000000000000403, 0x0000000004020100, 0x0000000000040201, + 0x0000000000040200, 0x0000000000000402, 0x0000000000040100, + 0x0000000000000401, 0x0000000000000400, 0x0000000000000004, + 0x0000000003020100, 0x0000000000030201, 0x0000000000030200, + 0x0000000000000302, 0x0000000000030100, 0x0000000000000301, + 0x0000000000000300, 0x0000000000000003, 0x0000000000020100, + 0x0000000000000201, 0x0000000000000200, 0x0000000000000002, + 0x0000000000000100, 0x0000000000000001, 0x0000000000000000, + 0x0000000000000000, +}; + +constexpr uint8_t pshufb_combine_table[272] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; + +constexpr unsigned char BitsSetTable256mul2[256] = { + 0, 2, 2, 4, 2, 4, 4, 6, 2, 4, 4, 6, 4, 6, 6, 8, 2, 4, 4, + 6, 4, 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 2, 4, 4, 6, 4, 6, + 6, 8, 4, 6, 
6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, + 8, 8, 10, 8, 10, 10, 12, 2, 4, 4, 6, 4, 6, 6, 8, 4, 6, 6, 8, + 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, + 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, 8, + 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 2, 4, 4, 6, 4, + 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, + 6, 8, 8, 10, 8, 10, 10, 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, + 10, 8, 10, 10, 12, 6, 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, + 12, 14, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, + 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 6, 8, 8, 10, + 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 8, 10, 10, 12, 10, 12, 12, + 14, 10, 12, 12, 14, 12, 14, 14, 16}; + +constexpr uint8_t to_base64_value[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 64, 64, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, + 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, + 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + +constexpr uint8_t to_base64_url_value[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 64, 64, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 62, 255, 255, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, + 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 255, 255, 255, 255, 63, 255, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + +constexpr uint8_t to_base64_default_or_url_value[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 64, 64, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 64, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, + 62, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, + 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 255, 255, 255, 255, 63, 255, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + +static_assert(sizeof(to_base64_value) == 256, + "to_base64_value must have 256 elements"); +static_assert(sizeof(to_base64_url_value) == 256, + "to_base64_url_value must have 256 elements"); +static_assert(to_base64_value[uint8_t(' ')] == 64, + "space must be == 64 in to_base64_value"); +static_assert(to_base64_url_value[uint8_t(' ')] == 64, + "space must be == 64 in to_base64_url_value"); +static_assert(to_base64_value[uint8_t('\t')] == 64, + "tab must be == 64 in to_base64_value"); +static_assert(to_base64_url_value[uint8_t('\t')] == 64, + "tab must be == 64 in to_base64_url_value"); +static_assert(to_base64_value[uint8_t('\r')] == 64, + "cr must be == 64 in to_base64_value"); +static_assert(to_base64_url_value[uint8_t('\r')] == 64, + "cr must be == 64 in to_base64_url_value"); +static_assert(to_base64_value[uint8_t('\n')] == 64, + "lf must be == 64 in to_base64_value"); +static_assert(to_base64_url_value[uint8_t('\n')] == 64, + "lf must be == 64 in to_base64_url_value"); +static_assert(to_base64_value[uint8_t('\f')] == 64, + "ff must be == 64 in to_base64_value"); +static_assert(to_base64_url_value[uint8_t('\f')] == 64, + "ff must be == 64 in to_base64_url_value"); +static_assert(to_base64_value[uint8_t('+')] == 62, + "+ must be == 62 in to_base64_value"); +static_assert(to_base64_url_value[uint8_t('-')] == 62, + "- must be == 62 in to_base64_url_value"); +static_assert(to_base64_value[uint8_t('/')] == 63, + "/ must be == 63 in to_base64_value"); +static_assert(to_base64_url_value[uint8_t('_')] == 63, + "_ must be == 63 in to_base64_url_value"); +} // namespace base64 +} // namespace tables +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_BASE64_TABLES_H +/* end file include/simdutf/base64_tables.h */ +/* begin file include/simdutf/scalar/base64.h */ +#ifndef SIMDUTF_BASE64_H +#define SIMDUTF_BASE64_H + +#include +#include +#include +#include +#include + +namespace simdutf { +namespace scalar { +namespace { +namespace base64 { + +// This function is not expected to be fast. Do not use in long loops. +// In most instances you should be using is_ignorable. 
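+// (Editor's note, inferred from the tables and static_asserts above: in the
+// to_base64_* tables, values 0..63 are decoded sextets, 64 marks ignorable
+// ASCII whitespace, and 255 marks invalid bytes. For example,
+// to_base64_value[uint8_t('A')] == 0, to_base64_value[uint8_t('+')] == 62 and
+// to_base64_value[uint8_t(' ')] == 64.)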
+template bool is_ascii_white_space(char_type c) { + return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f'; +} + +template simdutf_constexpr23 bool is_eight_byte(char_type c) { + if simdutf_constexpr (sizeof(char_type) == 1) { + return true; + } + return uint8_t(c) == c; +} + +template +simdutf_constexpr23 bool is_ignorable(char_type c, + simdutf::base64_options options) { + const uint8_t *to_base64 = + (options & base64_default_or_url) + ? tables::base64::to_base64_default_or_url_value + : ((options & base64_url) ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value); + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage) || + (options == base64_options::base64_default_or_url_accept_garbage); + uint8_t code = to_base64[uint8_t(c)]; + if (is_eight_byte(c) && code <= 63) { + return false; + } + if (is_eight_byte(c) && code == 64) { + return true; + } + return ignore_garbage; +} +template +simdutf_constexpr23 bool is_base64(char_type c, + simdutf::base64_options options) { + const uint8_t *to_base64 = + (options & base64_default_or_url) + ? tables::base64::to_base64_default_or_url_value + : ((options & base64_url) ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value); + uint8_t code = to_base64[uint8_t(c)]; + if (is_eight_byte(c) && code <= 63) { + return true; + } + return false; +} + +template +simdutf_constexpr23 bool is_base64_or_padding(char_type c, + simdutf::base64_options options) { + const uint8_t *to_base64 = + (options & base64_default_or_url) + ? tables::base64::to_base64_default_or_url_value + : ((options & base64_url) ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value); + if (c == '=') { + return true; + } + uint8_t code = to_base64[uint8_t(c)]; + if (is_eight_byte(c) && code <= 63) { + return true; + } + return false; +} + +template +bool is_ignorable_or_padding(char_type c, simdutf::base64_options options) { + return is_ignorable(c, options) || c == '='; +} + +struct reduced_input { + size_t equalsigns; // number of padding characters '=', typically 0, 1, 2. + size_t equallocation; // location of the first padding character if any + size_t srclen; // length of the input buffer before padding + size_t full_input_length; // length of the input buffer with padding but + // without ignorable characters }; +// find the end of the base64 input buffer +// It returns the number of padding characters, the location of the first +// padding character if any, the length of the input buffer before padding +// and the length of the input buffer with padding. The input buffer is not +// modified. The function assumes that there are at most two padding characters. +template +simdutf_constexpr23 reduced_input find_end(const char_type *src, size_t srclen, + simdutf::base64_options options) { + const uint8_t *to_base64 = + (options & base64_default_or_url) + ? tables::base64::to_base64_default_or_url_value + : ((options & base64_url) ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value); + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage) || + (options == base64_options::base64_default_or_url_accept_garbage); + + size_t equalsigns = 0; + // We intentionally include trailing spaces in the full input length. 
+  // See https://github.com/simdutf/simdutf/issues/824
+  size_t full_input_length = srclen;
+  // skip trailing spaces
+  while (!ignore_garbage && srclen > 0 &&
+         scalar::base64::is_eight_byte(src[srclen - 1]) &&
+         to_base64[uint8_t(src[srclen - 1])] == 64) {
+    srclen--;
+  }
+  size_t equallocation =
+      srclen; // location of the first padding character if any
+  if (ignore_garbage) {
+    // Technically, we don't need to find the first padding character, we can
+    // just change our algorithms, but it adds substantial complexity.
+    auto it = simdutf::find(src, src + srclen, '=');
+    if (it != src + srclen) {
+      equallocation = it - src;
+      equalsigns = 1;
+      srclen = equallocation;
+      full_input_length = equallocation + 1;
+    }
+    return {equalsigns, equallocation, srclen, full_input_length};
+  }
+  if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') {
+    // This is the last '=' sign.
+    equallocation = srclen - 1;
+    srclen--;
+    equalsigns = 1;
+    // skip trailing spaces
+    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+           to_base64[uint8_t(src[srclen - 1])] == 64) {
+      srclen--;
+    }
+    if (srclen > 0 && src[srclen - 1] == '=') {
+      // This is the second '=' sign.
+      equallocation = srclen - 1;
+      srclen--;
+      equalsigns = 2;
+    }
+  }
+  return {equalsigns, equallocation, srclen, full_input_length};
+}
+
+// Returns true upon success. The destination buffer must be large enough.
+// This function assumes that the padding (=) has been removed.
+// If check_capacity is true, it will check that the destination buffer is
+// large enough. If it is not, it will return OUTPUT_BUFFER_TOO_SMALL.
+template <bool check_capacity, typename char_type>
+simdutf_constexpr23 full_result base64_tail_decode_impl(
+    char *dst, size_t outlen, const char_type *src, size_t length,
+    size_t padding_characters, // number of padding characters
+                               // '=', typically 0, 1, 2.
+    base64_options options, last_chunk_handling_options last_chunk_options) {
+  char *dstend = dst + outlen;
+  (void)dstend;
+  // This looks like 10 branches, but we expect the compiler to resolve this to
+  // two branches (easily predicted):
+  const uint8_t *to_base64 =
+      (options & base64_default_or_url)
+          ? tables::base64::to_base64_default_or_url_value
+          : ((options & base64_url) ? tables::base64::to_base64_url_value
+                                    : tables::base64::to_base64_value);
+  const uint32_t *d0 =
+      (options & base64_default_or_url)
+          ? tables::base64::base64_default_or_url::d0
+          : ((options & base64_url) ? tables::base64::base64_url::d0
+                                    : tables::base64::base64_default::d0);
+  const uint32_t *d1 =
+      (options & base64_default_or_url)
+          ? tables::base64::base64_default_or_url::d1
+          : ((options & base64_url) ? tables::base64::base64_url::d1
+                                    : tables::base64::base64_default::d1);
+  const uint32_t *d2 =
+      (options & base64_default_or_url)
+          ? tables::base64::base64_default_or_url::d2
+          : ((options & base64_url) ? tables::base64::base64_url::d2
+                                    : tables::base64::base64_default::d2);
+  const uint32_t *d3 =
+      (options & base64_default_or_url)
+          ? tables::base64::base64_default_or_url::d3
+          : ((options & base64_url) ?
tables::base64::base64_url::d3 + : tables::base64::base64_default::d3); + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage) || + (options == base64_options::base64_default_or_url_accept_garbage); + + const char_type *srcend = src + length; + const char_type *srcinit = src; + const char *dstinit = dst; + + uint32_t x; + size_t idx; + uint8_t buffer[4]; + while (true) { + while (srcend - src >= 4 && is_eight_byte(src[0]) && + is_eight_byte(src[1]) && is_eight_byte(src[2]) && + is_eight_byte(src[3]) && + (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] | + d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) { + if (check_capacity && dstend - dst < 3) { + return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + *dst++ = static_cast(x & 0xFF); + *dst++ = static_cast((x >> 8) & 0xFF); + *dst++ = static_cast((x >> 16) & 0xFF); + src += 4; + } + const char_type *srccur = src; + idx = 0; + // we need at least four characters. +#ifdef __clang__ + // If possible, we read four characters at a time. (It is an optimization.) + if (ignore_garbage && src + 4 <= srcend) { + char_type c0 = src[0]; + char_type c1 = src[1]; + char_type c2 = src[2]; + char_type c3 = src[3]; + + uint8_t code0 = to_base64[uint8_t(c0)]; + uint8_t code1 = to_base64[uint8_t(c1)]; + uint8_t code2 = to_base64[uint8_t(c2)]; + uint8_t code3 = to_base64[uint8_t(c3)]; + + buffer[idx] = code0; + idx += (is_eight_byte(c0) && code0 <= 63); + buffer[idx] = code1; + idx += (is_eight_byte(c1) && code1 <= 63); + buffer[idx] = code2; + idx += (is_eight_byte(c2) && code2 <= 63); + buffer[idx] = code3; + idx += (is_eight_byte(c3) && code3 <= 63); + src += 4; + } +#endif + while ((idx < 4) && (src < srcend)) { + char_type c = *src; + + uint8_t code = to_base64[uint8_t(c)]; + buffer[idx] = uint8_t(code); + if (is_eight_byte(c) && code <= 63) { + idx++; + } else if (!ignore_garbage && + (code > 64 || !scalar::base64::is_eight_byte(c))) { + return {INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } else { + // We have a space or a newline or garbage. We ignore it. + } + src++; + } + if (idx != 4) { + simdutf_log_assert(idx < 4, "idx should be less than 4"); + // We never should have that the number of base64 characters + the + // number of padding characters is more than 4. + if (!ignore_garbage && (idx + padding_characters > 4)) { + return {INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit), true}; + } + + // The idea here is that in loose mode, + // if there is padding at all, it must be used + // to form 4-wise chunk. However, in loose mode, + // we do accept no padding at all. + if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::loose && + (idx >= 2) && padding_characters > 0 && + ((idx + padding_characters) & 3) != 0) { + return {INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit), true}; + } else + + // The idea here is that in strict mode, we do not want to accept + // incomplete base64 chunks. So if the chunk was otherwise valid, we + // return BASE64_INPUT_REMAINDER. 
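+      // (Editor's illustration of the branches below, using the input
+      // "QUJDRA" -- one full quad "QUJD" plus a partial chunk "RA", idx == 2,
+      // no padding: loose decodes "ABCD"; strict returns
+      // BASE64_INPUT_REMAINDER because the partial chunk is unpadded;
+      // stop_before_partial rewinds and reports SUCCESS after "ABC".)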
+      if (!ignore_garbage &&
+          last_chunk_options == last_chunk_handling_options::strict &&
+          (idx >= 2) && ((idx + padding_characters) & 3) != 0) {
+        // The partial chunk was at src - idx
+        return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
+                size_t(dst - dstinit), true};
+      } else
+      // If there is a partial chunk with insufficient padding, with
+      // stop_before_partial, we need to just ignore it. In "only full"
+      // mode, skip the minute there are padding characters.
+      if ((last_chunk_options ==
+               last_chunk_handling_options::stop_before_partial &&
+           (padding_characters + idx < 4) && (idx != 0) &&
+           (idx >= 2 || padding_characters == 0)) ||
+          (last_chunk_options ==
+               last_chunk_handling_options::only_full_chunks &&
+           (idx >= 2 || padding_characters == 0))) {
+        // partial means that we are *not* going to consume the read
+        // characters. We need to rewind the src pointer.
+        src = srccur;
+        return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
+      } else {
+        if (idx == 2) {
+          uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
+                            (uint32_t(buffer[1]) << 2 * 6);
+          if (!ignore_garbage &&
+              (last_chunk_options == last_chunk_handling_options::strict) &&
+              (triple & 0xffff)) {
+            return {BASE64_EXTRA_BITS, size_t(src - srcinit),
+                    size_t(dst - dstinit)};
+          }
+          if (check_capacity && dstend - dst < 1) {
+            return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
+                    size_t(dst - dstinit)};
+          }
+          *dst++ = static_cast<char>((triple >> 16) & 0xFF);
+        } else if (idx == 3) {
+          uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
+                            (uint32_t(buffer[1]) << 2 * 6) +
+                            (uint32_t(buffer[2]) << 1 * 6);
+          if (!ignore_garbage &&
+              (last_chunk_options == last_chunk_handling_options::strict) &&
+              (triple & 0xff)) {
+            return {BASE64_EXTRA_BITS, size_t(src - srcinit),
+                    size_t(dst - dstinit)};
+          }
+          if (check_capacity && dstend - dst < 2) {
+            return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
+                    size_t(dst - dstinit)};
+          }
+          *dst++ = static_cast<char>((triple >> 16) & 0xFF);
+          *dst++ = static_cast<char>((triple >> 8) & 0xFF);
+        } else if (!ignore_garbage && idx == 1 &&
+                   (!is_partial(last_chunk_options) ||
+                    (is_partial(last_chunk_options) &&
+                     padding_characters > 0))) {
+          return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
+                  size_t(dst - dstinit)};
+        } else if (!ignore_garbage && idx == 0 && padding_characters > 0) {
+          return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+                  size_t(dst - dstinit), true};
+        }
+        return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
+      }
+    }
+    if (check_capacity && dstend - dst < 3) {
+      return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
+              size_t(dst - dstinit)};
+    }
+    uint32_t triple =
+        (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
+        (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
+    *dst++ = static_cast<char>((triple >> 16) & 0xFF);
+    *dst++ = static_cast<char>((triple >> 8) & 0xFF);
+    *dst++ = static_cast<char>(triple & 0xFF);
+  }
+}
+
+template <typename char_type>
+simdutf_constexpr23 full_result base64_tail_decode(
+    char *dst, const char_type *src, size_t length,
+    size_t padding_characters, // number of padding characters
+                               // '=', typically 0, 1, 2.
+    base64_options options, last_chunk_handling_options last_chunk_options) {
+  return base64_tail_decode_impl<false>(dst, 0, src, length,
+                                        padding_characters, options,
+                                        last_chunk_options);
+}
+
+// like base64_tail_decode, but it will not write past the end of the output
+// buffer. The outlen parameter is modified to reflect the number of bytes
+// written. This function assumes that the padding (=) has been removed.
+//
+template <typename char_type>
+simdutf_constexpr23 full_result base64_tail_decode_safe(
+    char *dst, size_t outlen, const char_type *src, size_t length,
+    size_t padding_characters, // number of padding characters
+                               // '=', typically 0, 1, 2.
+    base64_options options, last_chunk_handling_options last_chunk_options) {
+  return base64_tail_decode_impl<true>(dst, outlen, src, length,
+                                       padding_characters, options,
+                                       last_chunk_options);
+}
+
+inline simdutf_constexpr23 full_result
+patch_tail_result(full_result r, size_t previous_input, size_t previous_output,
+                  size_t equallocation, size_t full_input_length,
+                  last_chunk_handling_options last_chunk_options) {
+  r.input_count += previous_input;
+  r.output_count += previous_output;
+  if (r.padding_error) {
+    r.input_count = equallocation;
+  }
+
+  if (r.error == error_code::SUCCESS) {
+    if (!is_partial(last_chunk_options)) {
+      // A success when we are not in stop_before_partial mode
+      // means that we have consumed the whole input buffer.
+      r.input_count = full_input_length;
+    } else if (r.output_count % 3 != 0) {
+      r.input_count = full_input_length;
+    }
+  }
+  return r;
+}
+
+// Returns the number of bytes written. The destination buffer must be large
+// enough. It will add padding (=) if needed.
+template <bool use_lines>
+simdutf_constexpr23 size_t tail_encode_base64_impl(
+    char *dst, const char *src, size_t srclen, base64_options options,
+    size_t line_length = simdutf::default_line_length, size_t line_offset = 0) {
+  if simdutf_constexpr (use_lines) {
+    // sanitize line_length and starting_line_offset.
+    // line_length must be greater than 3.
+    if (line_length < 4) {
+      line_length = 4;
+    }
+    simdutf_log_assert(line_offset <= line_length,
+                       "line_offset should be less than line_length");
+  }
+  // By default, we use padding if we are not using the URL variant.
+  // This is checked with ((options & base64_url) == 0) which returns true if we
+  // are not using the URL variant. However, we also allow 'inversion' of the
+  // convention with the base64_reverse_padding option. If the
+  // base64_reverse_padding option is set, we use padding if we are using the
+  // URL variant, and we omit it if we are not using the URL variant. This is
+  // checked with
+  // ((options & base64_reverse_padding) == base64_reverse_padding).
+  bool use_padding =
+      ((options & base64_url) == 0) ^
+      ((options & base64_reverse_padding) == base64_reverse_padding);
+  // This looks like 3 branches, but we expect the compiler to resolve this to
+  // a single branch:
+  const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0
+                                          : tables::base64::base64_default::e0;
+  const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1
+                                          : tables::base64::base64_default::e1;
+  const char *e2 = (options & base64_url) ?
tables::base64::base64_url::e2 + : tables::base64::base64_default::e2; + char *out = dst; + size_t i = 0; + uint8_t t1, t2, t3; + for (; i + 2 < srclen; i += 3) { + t1 = uint8_t(src[i]); + t2 = uint8_t(src[i + 1]); + t3 = uint8_t(src[i + 2]); + if simdutf_constexpr (use_lines) { + if (line_offset + 3 >= line_length) { + if (line_offset == line_length) { + *out++ = '\n'; + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *out++ = e2[t3]; + line_offset = 4; + } else if (line_offset + 1 == line_length) { + *out++ = e0[t1]; + *out++ = '\n'; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *out++ = e2[t3]; + line_offset = 3; + } else if (line_offset + 2 == line_length) { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = '\n'; + *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *out++ = e2[t3]; + line_offset = 2; + } else if (line_offset + 3 == line_length) { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *out++ = '\n'; + *out++ = e2[t3]; + line_offset = 1; + } + } else { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *out++ = e2[t3]; + line_offset += 4; + } + } else { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *out++ = e2[t3]; + } + } + switch (srclen - i) { + case 0: + break; + case 1: + t1 = uint8_t(src[i]); + if simdutf_constexpr (use_lines) { + if (use_padding) { + if (line_offset + 3 >= line_length) { + if (line_offset == line_length) { + *out++ = '\n'; + *out++ = e0[t1]; + *out++ = e1[(t1 & 0x03) << 4]; + *out++ = '='; + *out++ = '='; + } else if (line_offset + 1 == line_length) { + *out++ = e0[t1]; + *out++ = '\n'; + *out++ = e1[(t1 & 0x03) << 4]; + *out++ = '='; + *out++ = '='; + } else if (line_offset + 2 == line_length) { + *out++ = e0[t1]; + *out++ = e1[(t1 & 0x03) << 4]; + *out++ = '\n'; + *out++ = '='; + *out++ = '='; + } else if (line_offset + 3 == line_length) { + *out++ = e0[t1]; + *out++ = e1[(t1 & 0x03) << 4]; + *out++ = '='; + *out++ = '\n'; + *out++ = '='; + } + } else { + *out++ = e0[t1]; + *out++ = e1[(t1 & 0x03) << 4]; + *out++ = '='; + *out++ = '='; + } + } else { + if (line_offset + 2 >= line_length) { + if (line_offset == line_length) { + *out++ = '\n'; + *out++ = e0[uint8_t(src[i])]; + *out++ = e1[(uint8_t(src[i]) & 0x03) << 4]; + } else if (line_offset + 1 == line_length) { + *out++ = e0[uint8_t(src[i])]; + *out++ = '\n'; + *out++ = e1[(uint8_t(src[i]) & 0x03) << 4]; + } else { + *out++ = e0[uint8_t(src[i])]; + *out++ = e1[(uint8_t(src[i]) & 0x03) << 4]; + // *out++ = '\n'; ==> no newline at the end of the output + } + } else { + *out++ = e0[uint8_t(src[i])]; + *out++ = e1[(uint8_t(src[i]) & 0x03) << 4]; + } + } + } else { + *out++ = e0[t1]; + *out++ = e1[(t1 & 0x03) << 4]; + if (use_padding) { + *out++ = '='; + *out++ = '='; + } + } + break; + default: /* case 2 */ + t1 = uint8_t(src[i]); + t2 = uint8_t(src[i + 1]); + if simdutf_constexpr (use_lines) { + if (use_padding) { + if (line_offset + 3 >= line_length) { + if (line_offset == line_length) { + *out++ = '\n'; + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e2[(t2 & 0x0F) << 2]; + *out++ = '='; + } else if (line_offset + 1 == 
line_length) { + *out++ = e0[t1]; + *out++ = '\n'; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e2[(t2 & 0x0F) << 2]; + *out++ = '='; + } else if (line_offset + 2 == line_length) { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = '\n'; + *out++ = e2[(t2 & 0x0F) << 2]; + *out++ = '='; + } else if (line_offset + 3 == line_length) { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e2[(t2 & 0x0F) << 2]; + *out++ = '\n'; + *out++ = '='; + } + } else { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e2[(t2 & 0x0F) << 2]; + *out++ = '='; + } + } else { + if (line_offset + 3 >= line_length) { + if (line_offset == line_length) { + *out++ = '\n'; + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e2[(t2 & 0x0F) << 2]; + } else if (line_offset + 1 == line_length) { + *out++ = e0[t1]; + *out++ = '\n'; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e2[(t2 & 0x0F) << 2]; + } else if (line_offset + 2 == line_length) { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = '\n'; + *out++ = e2[(t2 & 0x0F) << 2]; + } else { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e2[(t2 & 0x0F) << 2]; + // *out++ = '\n'; ==> no newline at the end of the output + } + } else { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e2[(t2 & 0x0F) << 2]; + } + } + } else { + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e2[(t2 & 0x0F) << 2]; + if (use_padding) { + *out++ = '='; + } + } + } + return (size_t)(out - dst); +} + +// Returns the number of bytes written. The destination buffer must be large +// enough. It will add padding (=) if needed. +inline simdutf_constexpr23 size_t tail_encode_base64(char *dst, const char *src, + size_t srclen, + base64_options options) { + return tail_encode_base64_impl(dst, src, srclen, options); +} + +template +simdutf_warn_unused simdutf_constexpr23 size_t +maximal_binary_length_from_base64(InputPtr input, size_t length) noexcept { + // We process the padding characters ('=') at the end to make sure + // that we return an exact result when the input has no ignorable characters + // (e.g., spaces). + size_t padding = 0; + if (length > 0) { + if (input[length - 1] == '=') { + padding++; + if (length > 1 && input[length - 2] == '=') { + padding++; + } + } + } + // The input is not otherwise processed for ignorable characters or + // validation, so that the function runs in constant time (very fast). In + // practice, base64 inputs without ignorable characters are common and the + // common case are line separated inputs with relatively long lines (e.g., 76 + // characters) which leads this function to a slight (1%) overestimation of + // the output size. + // + // Of course, some inputs might contain an arbitrary number of spaces or + // newlines, which would make this function return a very pessimistic output + // size but systems that produce base64 outputs typically do not do that and + // if they do, they do not care much about minimizing memory usage. + // + // In specialized applications, users may know that their input is line + // separated, which can be checked very quickly by by iterating (e.g., over 76 + // character chunks, looking for the linefeed characters only). 
We could + // provide a specialized function for that, but it is not clear that the added + // complexity is worth it for us. + // + size_t actual_length = length - padding; + if (actual_length % 4 <= 1) { + return actual_length / 4 * 3; + } + // if we have a valid input, then the remainder must be 2 or 3 adding one or + // two extra bytes. + return actual_length / 4 * 3 + (actual_length % 4) - 1; +} + +template +simdutf_warn_unused simdutf_constexpr23 full_result +base64_to_binary_details_impl( + const char_type *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage) || + (options == base64_options::base64_default_or_url_accept_garbage); + auto ri = simdutf::scalar::base64::find_end(input, length, options); + size_t equallocation = ri.equallocation; + size_t equalsigns = ri.equalsigns; + length = ri.srclen; + size_t full_input_length = ri.full_input_length; + if (length == 0) { + if (!ignore_garbage && equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, full_input_length, 0}; + } + full_result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + r = scalar::base64::patch_tail_result(r, 0, 0, equallocation, + full_input_length, last_chunk_options); + if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS && + equalsigns > 0 && !ignore_garbage) { + // additional checks + if ((r.output_count % 3 == 0) || + ((r.output_count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; + } + } + // When is_partial(last_chunk_options) is true, we must either end with + // the end of the stream (beyond whitespace) or right after a non-ignorable + // character or at the very beginning of the stream. + // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS && + r.input_count < full_input_length) { + // First check if we can extend the input to the end of the stream + while (r.input_count < full_input_length && + base64_ignorable(*(input + r.input_count), options)) { + r.input_count++; + } + // If we are still not at the end of the stream, then we must backtrack + // to the last non-ignorable character. 
+    if (r.input_count < full_input_length) {
+      while (r.input_count > 0 &&
+             base64_ignorable(*(input + r.input_count - 1), options)) {
+        r.input_count--;
+      }
+    }
+  }
+  return r;
+}
+
+template <typename char_type>
+simdutf_constexpr23 simdutf_warn_unused full_result
+base64_to_binary_details_safe_impl(
+    const char_type *input, size_t length, char *output, size_t outlen,
+    base64_options options,
+    last_chunk_handling_options last_chunk_options) noexcept {
+  const bool ignore_garbage =
+      (options == base64_options::base64_url_accept_garbage) ||
+      (options == base64_options::base64_default_accept_garbage) ||
+      (options == base64_options::base64_default_or_url_accept_garbage);
+  auto ri = simdutf::scalar::base64::find_end(input, length, options);
+  size_t equallocation = ri.equallocation;
+  size_t equalsigns = ri.equalsigns;
+  length = ri.srclen;
+  size_t full_input_length = ri.full_input_length;
+  if (length == 0) {
+    if (!ignore_garbage && equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation, 0};
+    }
+    return {SUCCESS, full_input_length, 0};
+  }
+  full_result r = scalar::base64::base64_tail_decode_safe(
+      output, outlen, input, length, equalsigns, options, last_chunk_options);
+  r = scalar::base64::patch_tail_result(r, 0, 0, equallocation,
+                                        full_input_length, last_chunk_options);
+  if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
+      equalsigns > 0 && !ignore_garbage) {
+    // additional checks
+    if ((r.output_count % 3 == 0) ||
+        ((r.output_count % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
+    }
+  }
+
+  // When is_partial(last_chunk_options) is true, we must either end with
+  // the end of the stream (beyond whitespace) or right after a non-ignorable
+  // character or at the very beginning of the stream.
+  // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
+  if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
+      r.input_count < full_input_length) {
+    // First check if we can extend the input to the end of the stream
+    while (r.input_count < full_input_length &&
+           base64_ignorable(*(input + r.input_count), options)) {
+      r.input_count++;
+    }
+    // If we are still not at the end of the stream, then we must backtrack
+    // to the last non-ignorable character.
+    if (r.input_count < full_input_length) {
+      while (r.input_count > 0 &&
+             base64_ignorable(*(input + r.input_count - 1), options)) {
+        r.input_count--;
+      }
+    }
+  }
+  return r;
+}
+
+simdutf_warn_unused simdutf_constexpr23 size_t
+base64_length_from_binary(size_t length, base64_options options) noexcept {
+  // By default, we use padding if we are not using the URL variant.
+  // This is checked with ((options & base64_url) == 0) which returns true if we
+  // are not using the URL variant. However, we also allow 'inversion' of the
+  // convention with the base64_reverse_padding option. If the
+  // base64_reverse_padding option is set, we use padding if we are using the
+  // URL variant, and we omit it if we are not using the URL variant. This is
+  // checked with
+  // ((options & base64_reverse_padding) == base64_reverse_padding).
+  bool use_padding =
+      ((options & base64_url) == 0) ^
+      ((options & base64_reverse_padding) == base64_reverse_padding);
+  if (!use_padding) {
+    return length / 3 * 4 + ((length % 3) ? (length % 3) + 1 : 0);
+  }
+  return (length + 2) / 3 *
+         4; // We use padding to make the length a multiple of 4.
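+  // Worked example (editor's note): for length == 10, the padded size is
+  // (10 + 2) / 3 * 4 == 16, while the unpadded size computed above is
+  // 10 / 3 * 4 + (10 % 3) + 1 == 12 + 2 == 14.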
+} + +simdutf_warn_unused simdutf_constexpr23 size_t +base64_length_from_binary_with_lines(size_t length, base64_options options, + size_t line_length) noexcept { + if (length == 0) { + return 0; + } + size_t base64_length = + scalar::base64::base64_length_from_binary(length, options); + if (line_length < 4) { + line_length = 4; + } + size_t lines = + (base64_length + line_length - 1) / line_length; // number of lines + return base64_length + lines - 1; +} + +// Return the length of the prefix that contains count base64 characters. +// Thus, if count is 3, the function returns the length of the prefix +// that contains 3 base64 characters. +// The function returns (size_t)-1 if there is not enough base64 characters in +// the input. +template +simdutf_warn_unused size_t prefix_length(size_t count, + simdutf::base64_options options, + const char_type *input, + size_t length) noexcept { + size_t i = 0; + while (i < length && is_ignorable(input[i], options)) { + i++; + } + if (count == 0) { + return i; // duh! + } + for (; i < length; i++) { + if (is_ignorable(input[i], options)) { + continue; + } + // We have a base64 character or a padding character. + count--; + if (count == 0) { + return i + 1; + } + } + simdutf_log_assert(false, "You never get here"); + + return -1; // should never happen +} + +} // namespace base64 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file include/simdutf/scalar/base64.h */ + +namespace simdutf { + + #if SIMDUTF_CPLUSPLUS17 +inline std::string_view to_string(base64_options options) { + switch (options) { + case base64_default: + return "base64_default"; + case base64_url: + return "base64_url"; + case base64_reverse_padding: + return "base64_reverse_padding"; + case base64_url_with_padding: + return "base64_url_with_padding"; + case base64_default_accept_garbage: + return "base64_default_accept_garbage"; + case base64_url_accept_garbage: + return "base64_url_accept_garbage"; + case base64_default_or_url: + return "base64_default_or_url"; + case base64_default_or_url_accept_garbage: + return "base64_default_or_url_accept_garbage"; + } + return ""; +} + #endif // SIMDUTF_CPLUSPLUS17 + + #if SIMDUTF_CPLUSPLUS17 +inline std::string_view to_string(last_chunk_handling_options options) { + switch (options) { + case loose: + return "loose"; + case strict: + return "strict"; + case stop_before_partial: + return "stop_before_partial"; + case only_full_chunks: + return "only_full_chunks"; + } + return ""; +} + #endif + /** * Provide the maximal binary length in bytes given the base64 input. - * In general, if the input contains ASCII spaces, the result will be less than - * the maximum length. + * As long as the input does not contain ignorable characters (e.g., ASCII + * spaces or linefeed characters), the result is exact. In particular, the + * function checks for padding characters. + * + * The function is fast (constant time). It checks up to two characters at + * the end of the string. The input is not otherwise validated or read. 
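+ *
+ * Worked example (editor's note): for the 8-byte input "SGVsbG8=" there is
+ * one trailing '=', so the function computes 7 remaining characters and
+ * returns 7 / 4 * 3 + (7 % 4) - 1 = 3 + 2 = 5 -- exactly the five bytes of
+ * "Hello".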
* * @param input the base64 input to process * @param length the length of the base64 input in bytes * @return maximum number of binary bytes */ -simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept; +simdutf_warn_unused size_t +maximal_binary_length_from_base64(const char *input, size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +maximal_binary_length_from_base64( + const detail::input_span_of_byte_like auto &input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::base64::maximal_binary_length_from_base64( + detail::constexpr_cast_ptr(input.data()), input.size()); + } else + #endif + { + return maximal_binary_length_from_base64( + reinterpret_cast(input.data()), input.size()); + } +} + #endif // SIMDUTF_SPAN /** * Provide the maximal binary length in bytes given the base64 input. - * In general, if the input contains ASCII spaces, the result will be less than - * the maximum length. + * As long as the input does not contain ignorable characters (e.g., ASCII + * spaces or linefeed characters), the result is exact. In particular, the + * function checks for padding characters. * - * @param input the base64 input to process, in ASCII stored as 16-bit units + * The function is fast (constant time). It checks up to two characters at + * the end of the string. The input is not otherwise validated or read. + * + * @param input the base64 input to process, in ASCII stored as 16-bit + * units * @param length the length of the base64 input in 16-bit units * @return maximal number of binary bytes */ -simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept; +simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char16_t *input, size_t length) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +maximal_binary_length_from_base64(std::span input) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::base64::maximal_binary_length_from_base64(input.data(), + input.size()); + } else + #endif + { + return maximal_binary_length_from_base64(input.data(), input.size()); + } +} + #endif // SIMDUTF_SPAN /** * Convert a base64 input to a binary output. * - * This function follows the WHATWG forgiving-base64 format, which means that it will - * ignore any ASCII spaces in the input. You may provide a padded input (with one or two - * equal signs at the end) or an unpadded input (without any equal signs at the end). + * This function follows the WHATWG forgiving-base64 format, which means that it + * will ignore any ASCII spaces in the input. You may provide a padded input + * (with one or two equal signs at the end) or an unpadded input (without any + * equal signs at the end). * * See https://infra.spec.whatwg.org/#forgiving-base64-decode * - * This function will fail in case of invalid input. There are two possible reasons for - * failure: the input contains a number of base64 characters that when divided by 4, leaves - * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character - * that is not a valid base64 character (INVALID_BASE64_CHARACTER). + * This function will fail in case of invalid input. 
When last_chunk_options = + * loose, there are two possible reasons for failure: the input contains a + * number of base64 characters that when divided by 4, leaves a single remainder + * character (BASE64_INPUT_REMAINDER), or the input contains a character that is + * not a valid base64 character (INVALID_BASE64_CHARACTER). + * + * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the + * input where the invalid character was found. When the error is + * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded. + * + * The default option (simdutf::base64_default) expects the characters `+` and + * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the + * characters `-` and `_` as part of its alphabet. + * + * The padding (`=`) is validated if present. There may be at most two padding + * characters at the end of the input. If there are any padding characters, the + * total number of characters (excluding spaces but including padding + * characters) must be divisible by four. + * + * You should call this function with a buffer that is at least + * maximal_binary_length_from_base64(input, length) bytes long. If you fail to + * provide that much space, the function may cause a buffer overflow. + * + * Advanced users may want to tailor how the last chunk is handled. By default, + * we use a loose (forgiving) approach but we also support a strict approach + * as well as a stop_before_partial approach, as per the following proposal: + * + * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + * + * @param input the base64 string to process + * @param length the length of the string in bytes + * @param output the pointer to a buffer that can hold the conversion + * result (should be at least maximal_binary_length_from_base64(input, length) + * bytes long). + * @param options the base64 options to use, usually base64_default or + * base64_url, and base64_default by default. + * @param last_chunk_options the last chunk handling options, + * last_chunk_handling_options::loose by default + * but can also be last_chunk_handling_options::strict or + * last_chunk_handling_options::stop_before_partial. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in bytes) if any, or the number of bytes written if successful. + */ +simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = loose) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +base64_to_binary( + const detail::input_span_of_byte_like auto &input, + detail::output_span_of_byte_like auto &&binary_output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = loose) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::base64::base64_to_binary_details_impl( + input.data(), input.size(), binary_output.data(), options, + last_chunk_options); + } else + #endif + { + return base64_to_binary(reinterpret_cast(input.data()), + input.size(), + reinterpret_cast(binary_output.data()), + options, last_chunk_options); + } +} + #endif // SIMDUTF_SPAN + +/** + * Provide the base64 length in bytes given the length of a binary input. 
+ * + * @param length the length of the input in bytes + * @return number of base64 bytes + */ +inline simdutf_warn_unused simdutf_constexpr23 size_t base64_length_from_binary( + size_t length, base64_options options = base64_default) noexcept { + return scalar::base64::base64_length_from_binary(length, options); +} + +/** + * Provide the base64 length in bytes given the length of a binary input, + * taking into account line breaks. + * + * @param length the length of the input in bytes + * @param line_length the length of lines, must be at least 4 (otherwise it is + * interpreted as 4), + * @return number of base64 bytes + */ +inline simdutf_warn_unused simdutf_constexpr23 size_t +base64_length_from_binary_with_lines( + size_t length, base64_options options = base64_default, + size_t line_length = default_line_length) noexcept { + return scalar::base64::base64_length_from_binary_with_lines(length, options, + line_length); +} + +/** + * Convert a binary input to a base64 output. + * + * The default option (simdutf::base64_default) uses the characters `+` and `/` + * as part of its alphabet. Further, it adds padding (`=`) at the end of the + * output to ensure that the output length is a multiple of four. * - * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input - * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then - * r.count contains the number of bytes decoded. + * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part + * of its alphabet. No padding is added at the end of the output. * - * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long. - * If you fail to provide that much space, the function may cause a buffer overflow. + * This function always succeeds. * - * @param input the base64 string to process - * @param length the length of the string in bytes - * @param output the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long). - * @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default. - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful. + * @param input the binary to process + * @param length the length of the input in bytes + * @param output the pointer to a buffer that can hold the conversion + * result (should be at least base64_length_from_binary(length) bytes long) + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. 
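+ *
+ * Example (editor's illustration): encoding the two bytes "Hi" yields
+ * "SGk=" (4 bytes) with base64_default and "SGk" (3 bytes) with base64_url.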
+ * @return number of written bytes, will be equal to + * base64_length_from_binary(length, options) */ -simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept; +size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options = base64_default) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +binary_to_base64(const detail::input_span_of_byte_like auto &input, + detail::output_span_of_byte_like auto &&binary_output, + base64_options options = base64_default) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::base64::tail_encode_base64( + binary_output.data(), input.data(), input.size(), options); + } else + #endif + { + return binary_to_base64( + reinterpret_cast(input.data()), input.size(), + reinterpret_cast(binary_output.data()), options); + } +} + #endif // SIMDUTF_SPAN /** - * Provide the base64 length in bytes given the length of a binary input. + * Convert a binary input to a base64 output with line breaks. + * + * The default option (simdutf::base64_default) uses the characters `+` and `/` + * as part of its alphabet. Further, it adds padding (`=`) at the end of the + * output to ensure that the output length is a multiple of four. + * + * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part + * of its alphabet. No padding is added at the end of the output. + * + * This function always succeeds. * + * @param input the binary to process * @param length the length of the input in bytes - * @return number of base64 bytes + * @param output the pointer to a buffer that can hold the conversion + * result (should be at least base64_length_from_binary_with_lines(length, + * options, line_length) bytes long) + * @param line_length the length of lines, must be at least 4 (otherwise it is + * interpreted as 4), + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. + * @return number of written bytes, will be equal to + * base64_length_from_binary_with_lines(length, options) */ -simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options = base64_default) noexcept; +size_t +binary_to_base64_with_lines(const char *input, size_t length, char *output, + size_t line_length = simdutf::default_line_length, + base64_options options = base64_default) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t +binary_to_base64_with_lines( + const detail::input_span_of_byte_like auto &input, + detail::output_span_of_byte_like auto &&binary_output, + size_t line_length = simdutf::default_line_length, + base64_options options = base64_default) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::base64::tail_encode_base64_impl( + binary_output.data(), input.data(), input.size(), options, line_length); + } else + #endif + { + return binary_to_base64_with_lines( + reinterpret_cast(input.data()), input.size(), + reinterpret_cast(binary_output.data()), line_length, options); + } +} + #endif // SIMDUTF_SPAN + #if SIMDUTF_ATOMIC_REF /** - * Convert a binary input to a base64 output. The output is always padded with equal signs so that it is - * a multiple of 4 bytes long. + * Convert a binary input to a base64 output, using atomic accesses. 
+ * This function comes with a potentially significant performance
+ * penalty, but it may be useful in some cases where the input
+ * buffers are shared between threads, to avoid undefined
+ * behavior in case of data races.
+ *
+ * The function is for advanced users. Its main use case is
+ * to silence sanitizer warnings. We have no documented use case
+ * where this function is actually necessary in terms of practical correctness.
+ *
+ * This function is only available when simdutf is compiled with
+ * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
+ * the availability of this function by checking the macro
+ * SIMDUTF_ATOMIC_REF.
+ *
+ * The default option (simdutf::base64_default) uses the characters `+` and `/`
+ * as part of its alphabet. Further, it adds padding (`=`) at the end of the
+ * output to ensure that the output length is a multiple of four.
+ *
+ * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
+ * of its alphabet. No padding is added at the end of the output.
+ *
+ * This function always succeeds.
+ *
+ * This function is considered experimental. It is not tested by default
+ * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
+ * It is not documented in the public API documentation (README). It is
+ * offered on a best-effort basis. We rely on the community for further
+ * testing and feedback.
+ *
+ * @brief atomic_binary_to_base64
+ * @param input the binary to process
+ * @param length the length of the input in bytes
- * @param output the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
- * @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default.
- * @return number of written bytes, will be equal to base64_length_from_binary(length, options)
+ * @param output the pointer to a buffer that can hold the conversion
+ * result (should be at least base64_length_from_binary(length) bytes long)
+ * @param options the base64 options to use, can be base64_default or
+ * base64_url, is base64_default by default.
+ * @return number of written bytes, will be equal to
+ * base64_length_from_binary(length, options)
 */
-size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
+size_t
+atomic_binary_to_base64(const char *input, size_t length, char *output,
+                        base64_options options = base64_default) noexcept;
+  #if SIMDUTF_SPAN
+simdutf_really_inline simdutf_warn_unused size_t
+atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input,
+                        detail::output_span_of_byte_like auto &&binary_output,
+                        base64_options options = base64_default) noexcept {
+  return atomic_binary_to_base64(
+      reinterpret_cast<const char *>(input.data()), input.size(),
+      reinterpret_cast<char *>(binary_output.data()), options);
+}
+  #endif // SIMDUTF_SPAN
+  #endif // SIMDUTF_ATOMIC_REF
 
 /**
  * Convert a base64 input to a binary output.
  *
- * This function follows the WHATWG forgiving-base64 format, which means that it will
- * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
- * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ * This function follows the WHATWG forgiving-base64 format, which means that it
+ * will ignore any ASCII spaces in the input. You may provide a padded input
+ * (with one or two equal signs at the end) or an unpadded input (without any
+ * equal signs at the end).
* * See https://infra.spec.whatwg.org/#forgiving-base64-decode * - * This function will fail in case of invalid input. There are two possible reasons for - * failure: the input contains a number of base64 characters that when divided by 4, leaves - * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character - * that is not a valid base64 character (INVALID_BASE64_CHARACTER). + * This function will fail in case of invalid input. When last_chunk_options = + * loose, there are two possible reasons for failure: the input contains a + * number of base64 characters that when divided by 4, leaves a single remainder + * character (BASE64_INPUT_REMAINDER), or the input contains a character that is + * not a valid base64 character (INVALID_BASE64_CHARACTER). + * + * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the + * input where the invalid character was found. When the error is + * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded. + * + * The default option (simdutf::base64_default) expects the characters `+` and + * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the + * characters `-` and `_` as part of its alphabet. + * + * The padding (`=`) is validated if present. There may be at most two padding + * characters at the end of the input. If there are any padding characters, the + * total number of characters (excluding spaces but including padding + * characters) must be divisible by four. * - * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input - * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then - * r.count contains the number of bytes decoded. + * You should call this function with a buffer that is at least + * maximal_binary_length_from_base64(input, length) bytes long. If you fail + * to provide that much space, the function may cause a buffer overflow. * - * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long. - * If you fail to provide that much space, the function may cause a buffer overflow. + * Advanced users may want to tailor how the last chunk is handled. By default, + * we use a loose (forgiving) approach but we also support a strict approach + * as well as a stop_before_partial approach, as per the following proposal: * - * @param input the base64 string to process, in ASCII stored as 16-bit units + * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + * + * @param input the base64 string to process, in ASCII stored as 16-bit + * units * @param length the length of the string in 16-bit units - * @param output the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long). - * @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default. - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of bytes written if successful. + * @param output the pointer to a buffer that can hold the conversion + * result (should be at least maximal_binary_length_from_base64(input, length) + * bytes long). + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. 
+ * @param last_chunk_options the last chunk handling options, + * last_chunk_handling_options::loose by default + * but can also be last_chunk_handling_options::strict or + * last_chunk_handling_options::stop_before_partial. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and position of the + * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number + * of bytes written if successful. + */ +simdutf_warn_unused result +base64_to_binary(const char16_t *input, size_t length, char *output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) noexcept; + #if SIMDUTF_SPAN +simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result +base64_to_binary( + std::span input, + detail::output_span_of_byte_like auto &&binary_output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = loose) noexcept { + #if SIMDUTF_CPLUSPLUS23 + if consteval { + return scalar::base64::base64_to_binary_details_impl( + input.data(), input.size(), binary_output.data(), options, + last_chunk_options); + } else + #endif + { + return base64_to_binary(input.data(), input.size(), + reinterpret_cast(binary_output.data()), + options, last_chunk_options); + } +} + #endif // SIMDUTF_SPAN + +/** + * Check if a character is an ignorable base64 character. + * Checking a large input, character by character, is not computationally + * efficient. + * + * @param input the character to check + * @param options the base64 options to use, is base64_default by default. + * @return true if the character is an ignorable base64 character, false + * otherwise. + */ +simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool +base64_ignorable(char input, base64_options options = base64_default) noexcept { + return scalar::base64::is_ignorable(input, options); +} +simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool +base64_ignorable(char16_t input, + base64_options options = base64_default) noexcept { + return scalar::base64::is_ignorable(input, options); +} + +/** + * Check if a character is a valid base64 character. + * Checking a large input, character by character, is not computationally + * efficient. + * Note that padding characters are not considered valid base64 characters in + * this context, nor are spaces. + * + * @param input the character to check + * @param options the base64 options to use, is base64_default by default. + * @return true if the character is a base64 character, false otherwise. + */ +simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool +base64_valid(char input, base64_options options = base64_default) noexcept { + return scalar::base64::is_base64(input, options); +} +simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool +base64_valid(char16_t input, base64_options options = base64_default) noexcept { + return scalar::base64::is_base64(input, options); +} + +/** + * Check if a character is a valid base64 character or the padding character + * ('='). Checking a large input, character by character, is not computationally + * efficient. + * + * @param input the character to check + * @param options the base64 options to use, is base64_default by default. + * @return true if the character is a base64 character, false otherwise. 
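+ *
+ * Examples (editor's illustration): base64_valid('+') is true by default but
+ * false under base64_url, base64_valid('-') is the reverse,
+ * base64_ignorable(' ') is true, and base64_valid('=') is false (use
+ * base64_valid_or_padding to accept the padding character).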
*/ -simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default) noexcept; +simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool +base64_valid_or_padding(char input, + base64_options options = base64_default) noexcept { + return scalar::base64::is_base64_or_padding(input, options); +} +simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool +base64_valid_or_padding(char16_t input, + base64_options options = base64_default) noexcept { + return scalar::base64::is_base64_or_padding(input, options); +} /** * Convert a base64 input to a binary output. * - * This function follows the WHATWG forgiving-base64 format, which means that it will - * ignore any ASCII spaces in the input. You may provide a padded input (with one or two - * equal signs at the end) or an unpadded input (without any equal signs at the end). + * This function follows the WHATWG forgiving-base64 format, which means that it + * will ignore any ASCII spaces in the input. You may provide a padded input + * (with one or two equal signs at the end) or an unpadded input (without any + * equal signs at the end). * * See https://infra.spec.whatwg.org/#forgiving-base64-decode * - * This function will fail in case of invalid input. There are three possible reasons for - * failure: the input contains a number of base64 characters that when divided by 4, leaves - * a single remainder character (BASE64_INPUT_REMAINDER), the input contains a character - * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer + * This function will fail in case of invalid input. When last_chunk_options = + * loose, there are three possible reasons for failure: the input contains a + * number of base64 characters that when divided by 4, leaves a single remainder + * character (BASE64_INPUT_REMAINDER), the input contains a character that is + * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer * is too small (OUTPUT_BUFFER_TOO_SMALL). * * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written - * and the number of units processed, see description of the parameters and returned value. + * and the number of units processed, see description of the parameters and + * returned value. + * + * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the + * input where the invalid character was found. When the error is + * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded. + * + * The default option (simdutf::base64_default) expects the characters `+` and + * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the + * characters `-` and `_` as part of its alphabet. + * + * The padding (`=`) is validated if present. There may be at most two padding + * characters at the end of the input. If there are any padding characters, the + * total number of characters (excluding spaces but including padding + * characters) must be divisible by four. * - * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input - * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then - * r.count contains the number of bytes decoded. + * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected + * to discard the output unless the parameter decode_up_to_bad_char is set to + * true. In that case, the function will decode up to the first invalid + * character. 
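As a usage sketch for the base64_to_binary decoder above (decode_base64 is a hypothetical helper; the output is sized with maximal_binary_length_from_base64, as the documentation requires):

#include "simdutf.h"
#include <string>
#include <vector>

// Hypothetical helper: decode a base64 string into raw bytes.
std::string decode_base64(const std::string &b64) {
  std::vector<char> out(
      simdutf::maximal_binary_length_from_base64(b64.data(), b64.size()));
  simdutf::result r =
      simdutf::base64_to_binary(b64.data(), b64.size(), out.data());
  if (r.error != simdutf::SUCCESS) {
    return {}; // r.count: error index (INVALID_BASE64_CHARACTER) or bytes decoded
  }
  return std::string(out.data(), r.count); // r.count: bytes written on success
}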
Extra padding characters ('=') are considered invalid characters. * - * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected to discard - * the output. + * Advanced users may want to tailor how the last chunk is handled. By default, + * we use a loose (forgiving) approach but we also support a strict approach + * as well as a stop_before_partial approach, as per the following proposal: * - * @param input the base64 string to process, in ASCII stored as 8-bit or 16-bit units + * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + * + * @param input the base64 string to process, in ASCII stored as 8-bit + * or 16-bit units * @param length the length of the string in 8-bit or 16-bit units. - * @param output the pointer to buffer that can hold the conversion result. - * @param outlen the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written. - * @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default. - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of units processed if successful. + * @param output the pointer to a buffer that can hold the conversion + * result. + * @param outlen the number of bytes that can be written in the output + * buffer. Upon return, it is modified to reflect how many bytes were written. + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. + * @param last_chunk_options the last chunk handling options, + * last_chunk_handling_options::loose by default + * but can also be last_chunk_handling_options::strict or + * last_chunk_handling_options::stop_before_partial. + * @param decode_up_to_bad_char if true, the function will decode up to the + * first invalid character. By default (false), it is assumed that the output + * buffer is to be discarded. When there are multiple errors in the input, + * using decode_up_to_bad_char might trigger a different error. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and position of the + * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number + * of units processed if successful. + */ +simdutf_warn_unused result +base64_to_binary_safe(const char *input, size_t length, char *output, + size_t &outlen, base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose, + bool decode_up_to_bad_char = false) noexcept; +// the span overload has moved to the bottom of the file + +simdutf_warn_unused result +base64_to_binary_safe(const char16_t *input, size_t length, char *output, + size_t &outlen, base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose, + bool decode_up_to_bad_char = false) noexcept; + // span overload moved to bottom of file + + #if SIMDUTF_ATOMIC_REF +/** + * Convert a base64 input to a binary output with a size limit and using atomic + * operations. + * + * Like `base64_to_binary_safe` but using atomic operations, this function is + * thread-safe for concurrent memory access, allowing the output + * buffers to be shared between threads without undefined behavior in case of + * data races. 
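A minimal sketch of calling the safe variant declared above (the buffer size and the decode_capped name are illustrative only): the caller caps the write through outlen, and OUTPUT_BUFFER_TOO_SMALL is reported when the cap is reached.

#include "simdutf.h"
#include <vector>

// Illustrative: decode into a fixed-capacity buffer, observing outlen on return.
simdutf::result decode_capped(const char *input, size_t length,
                              std::vector<char> &out) {
  size_t outlen = out.size(); // in: capacity; out: bytes actually written
  simdutf::result r =
      simdutf::base64_to_binary_safe(input, length, out.data(), outlen);
  out.resize(outlen);
  return r; // on OUTPUT_BUFFER_TOO_SMALL, r.count is the input units processed
}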
+ * + * This function comes with a potentially significant performance penalty, but + * is useful when thread safety is needed during base64 decoding. + * + * This function is only available when simdutf is compiled with + * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check + * the availability of this function by checking the macro + * SIMDUTF_ATOMIC_REF. + * + * This function is considered experimental. It is not tested by default + * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested. + * It is not documented in the public API documentation (README). It is + * offered on a best effort basis. We rely on the community for further + * testing and feedback. + * + * @param input the base64 input to decode + * @param length the length of the input in bytes + * @param output the pointer to buffer that can hold the conversion + * result + * @param outlen the number of bytes that can be written in the output + * buffer. Upon return, it is modified to reflect how many bytes were written. + * @param options the base64 options to use (default, url, etc.) + * @param last_chunk_options the last chunk handling options (loose, strict, + * stop_before_partial) + * @param decode_up_to_bad_char if true, the function will decode up to the + * first invalid character. By default (false), it is assumed that the output + * buffer is to be discarded. When there are multiple errors in the input, + * using decode_up_to_bad_char might trigger a different error. + * @return a result struct with an error code and count indicating error + * position or success + */ +simdutf_warn_unused result atomic_base64_to_binary_safe( + const char *input, size_t length, char *output, size_t &outlen, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose, + bool decode_up_to_bad_char = false) noexcept; +simdutf_warn_unused result atomic_base64_to_binary_safe( + const char16_t *input, size_t length, char *output, size_t &outlen, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = loose, + bool decode_up_to_bad_char = false) noexcept; + #if SIMDUTF_SPAN +/** + * @brief span overload + * @return a tuple of result and outlen + */ +simdutf_really_inline simdutf_warn_unused std::tuple +atomic_base64_to_binary_safe( + const detail::input_span_of_byte_like auto &binary_input, + detail::output_span_of_byte_like auto &&output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose, + bool decode_up_to_bad_char = false) noexcept { + size_t outlen = output.size(); + auto ret = atomic_base64_to_binary_safe( + reinterpret_cast(binary_input.data()), binary_input.size(), + reinterpret_cast(output.data()), outlen, options, + last_chunk_options, decode_up_to_bad_char); + return {ret, outlen}; +} +/** + * @brief span overload + * @return a tuple of result and outlen */ -simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept; -simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept; +simdutf_warn_unused std::tuple +atomic_base64_to_binary_safe( + std::span base64_input, + detail::output_span_of_byte_like auto &&binary_output, + base64_options options = base64_default, + last_chunk_handling_options 
last_chunk_options = loose, + bool decode_up_to_bad_char = false) noexcept { + size_t outlen = binary_output.size(); + auto ret = atomic_base64_to_binary_safe( + base64_input.data(), base64_input.size(), + reinterpret_cast(binary_output.data()), outlen, options, + last_chunk_options, decode_up_to_bad_char); + return {ret, outlen}; +} + #endif // SIMDUTF_SPAN + #endif // SIMDUTF_ATOMIC_REF + +#endif // SIMDUTF_FEATURE_BASE64 /** * An implementation of simdutf for a particular CPU architecture. * - * Also used to maintain the currently active implementation. The active implementation is - * automatically initialized on first use to the most advanced implementation supported by the host. + * Also used to maintain the currently active implementation. The active + * implementation is automatically initialized on first use to the most advanced + * implementation supported by the host. */ class implementation { public: - /** * The name of this implementation. * * const implementation *impl = simdutf::active_implementation; - * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * cout << "simdutf is optimized for " << impl->name() << "(" << + * impl->description() << ")" << endl; * * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" */ @@ -2465,7 +11325,8 @@ class implementation { * The description of this implementation. * * const implementation *impl = simdutf::active_implementation; - * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * cout << "simdutf is optimized for " << impl->name() << "(" << + * impl->description() << ")" << endl; * * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" */ @@ -2477,17 +11338,20 @@ class implementation { * and should therefore not be called too often if performance is a concern. * * - * @return true if the implementation can be safely used on the current system (determined at runtime) + * @return true if the implementation can be safely used on the current system + * (determined at runtime) */ bool supported_by_runtime_system() const; +#if SIMDUTF_FEATURE_DETECT_ENCODING /** * This function will try to detect the encoding * @param input the string to identify * @param length the length of the string in bytes. * @return the encoding type detected */ - virtual encoding_type autodetect_encoding(const char * input, size_t length) const noexcept; + virtual encoding_type autodetect_encoding(const char *input, + size_t length) const noexcept; /** * This function will try to detect the possible encodings in one pass @@ -2495,7 +11359,9 @@ class implementation { * @param length the length of the string in bytes. * @return the encoding type detected */ - virtual int detect_encodings(const char * input, size_t length) const noexcept = 0; + virtual int detect_encodings(const char *input, + size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_DETECT_ENCODING /** * @private For internal implementation use @@ -2504,9 +11370,11 @@ class implementation { * * @return a mask of all required `internal::instruction_set::` values */ - virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; } - + virtual uint32_t required_instruction_sets() const { + return _required_instruction_sets; + } +#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING /** * Validate the UTF-8 string. * @@ -2516,8 +11384,11 @@ class implementation { * @param len the length of the string in bytes. 
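As a quick sketch of the class documented above, the active implementation can be queried at runtime through simdutf::get_active_implementation(), the accessor the library provides for it:

#include "simdutf.h"
#include <iostream>

int main() {
  // Prints which kernel simdutf selected for this CPU, e.g. "haswell".
  std::cout << "simdutf is optimized for "
            << simdutf::get_active_implementation()->name() << " ("
            << simdutf::get_active_implementation()->description() << ")"
            << std::endl;
}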
* @return true if and only if the string is valid UTF-8. */ - simdutf_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0; + simdutf_warn_unused virtual bool validate_utf8(const char *buf, + size_t len) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF8 /** * Validate the UTF-8 string and stop on errors. * @@ -2525,10 +11396,16 @@ class implementation { * * @param buf the UTF-8 string to validate. * @param len the length of the string in bytes. - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated + * if successful. */ - simdutf_warn_unused virtual result validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0; + simdutf_warn_unused virtual result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 +#if SIMDUTF_FEATURE_ASCII /** * Validate the ASCII string. * @@ -2538,7 +11415,8 @@ class implementation { * @param len the length of the string in bytes. * @return true if and only if the string is valid ASCII. */ - simdutf_warn_unused virtual bool validate_ascii(const char *buf, size_t len) const noexcept = 0; + simdutf_warn_unused virtual bool + validate_ascii(const char *buf, size_t len) const noexcept = 0; /** * Validate the ASCII string and stop on error. @@ -2547,10 +11425,47 @@ class implementation { * * @param buf the ASCII string to validate. * @param len the length of the string in bytes. - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated + * if successful. + */ + simdutf_warn_unused virtual result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0; + +#endif // SIMDUTF_FEATURE_ASCII + +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII + /** + * Validate the ASCII string as a UTF-16BE sequence. + * A UTF-16 sequence is considered an ASCII sequence + * if it could be converted to an ASCII string losslessly. + * + * Overridden by each implementation. + * + * @param buf the UTF-16BE string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid ASCII. + */ + simdutf_warn_unused virtual bool + validate_utf16be_as_ascii(const char16_t *buf, size_t len) const noexcept = 0; + + /** + * Validate the ASCII string as a UTF-16LE sequence. + * A UTF-16 sequence is considered an ASCII sequence + * if it could be converted to an ASCII string losslessly. + * + * Overridden by each implementation. + * + * @param buf the UTF-16LE string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid ASCII.
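To make the validators above concrete, a brief sketch (buf, len and check_utf8 stand in for the caller's own names):

#include "simdutf.h"

// Returns true for valid UTF-8; on failure, reports where it broke.
bool check_utf8(const char *buf, size_t len, size_t &error_index) {
  simdutf::result res = simdutf::validate_utf8_with_errors(buf, len);
  if (res.error == simdutf::SUCCESS) {
    return true; // res.count is the number of code units validated
  }
  error_index = res.count; // index of the error, in code units
  return false;
}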
*/ - simdutf_warn_unused virtual result validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0; + simdutf_warn_unused virtual bool + validate_utf16le_as_ascii(const char16_t *buf, size_t len) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII +#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING /** * Validate the UTF-16LE string. This function may be best when you expect * the input to be almost always valid. Otherwise, consider using * @@ -2561,11 +11476,15 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-16LE string to validate. - * @param len the length of the string in number of 2-byte code units (char16_t). + * @param len the length of the string in number of 2-byte code units + * (char16_t). * @return true if and only if the string is valid UTF-16LE. */ - simdutf_warn_unused virtual bool validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0; + simdutf_warn_unused virtual bool + validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF16 /** * Validate the UTF-16BE string. This function may be best when you expect * the input to be almost always valid. Otherwise, consider using * @@ -2576,24 +11495,32 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-16BE string to validate. - * @param len the length of the string in number of 2-byte code units (char16_t). + * @param len the length of the string in number of 2-byte code units + * (char16_t). * @return true if and only if the string is valid UTF-16BE. */ - simdutf_warn_unused virtual bool validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0; + simdutf_warn_unused virtual bool + validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0; /** * Validate the UTF-16LE string and stop on error. It might be faster than - * validate_utf16le when an error is expected to occur early. + * validate_utf16le when an error is expected to occur early. * * Overridden by each implementation. * * This function is not BOM-aware. * * @param buf the UTF-16LE string to validate. - * @param len the length of the string in number of 2-byte code units (char16_t). - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @param len the length of the string in number of 2-byte code units + * (char16_t). + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated + * if successful. */ - simdutf_warn_unused virtual result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept = 0; + simdutf_warn_unused virtual result + validate_utf16le_with_errors(const char16_t *buf, + size_t len) const noexcept = 0; /** * Validate the UTF-16BE string and stop on error. It might be faster than @@ -2604,11 +11531,47 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-16BE string to validate. - * @param len the length of the string in number of 2-byte code units (char16_t).
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @param len the length of the string in number of 2-byte code units + * (char16_t). + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated + * if successful. + */ + simdutf_warn_unused virtual result + validate_utf16be_with_errors(const char16_t *buf, + size_t len) const noexcept = 0; + /** + * Copies the UTF-16LE string while replacing mismatched surrogates with the + * Unicode replacement character U+FFFD. We allow the input and output to be + * the same buffer so that the correction is done in-place. + * + * Overridden by each implementation. + * + * @param input the UTF-16LE string to correct. + * @param len the length of the string in number of 2-byte code units + * (char16_t). + * @param output the output buffer. + */ + virtual void to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output) const noexcept = 0; + /** + * Copies the UTF-16BE string while replacing mismatched surrogates with the + * Unicode replacement character U+FFFD. We allow the input and output to be + * the same buffer so that the correction is done in-place. + * + * Overridden by each implementation. + * + * @param input the UTF-16BE string to correct. + * @param len the length of the string in number of 2-byte code units + * (char16_t). + * @param output the output buffer. */ - simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0; + virtual void to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING /** * Validate the UTF-32 string. * @@ -2617,11 +11580,15 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-32 string to validate. - * @param len the length of the string in number of 4-byte code units (char32_t). + * @param len the length of the string in number of 4-byte code units + * (char32_t). * @return true if and only if the string is valid UTF-32. */ - simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0; + simdutf_warn_unused virtual bool + validate_utf32(const char32_t *buf, size_t len) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 /** * Validate the UTF-32 string and stop on error. * @@ -2630,25 +11597,36 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-32 string to validate. - * @param len the length of the string in number of 4-byte code units (char32_t). - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @param len the length of the string in number of 4-byte code units + * (char32_t). 
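Since the documentation above explicitly allows input == output, the surrogate scrubbers can run in place; a minimal sketch, assuming the free-function counterpart that simdutf exposes alongside the virtual member:

#include "simdutf.h"
#include <string>

// Replace mismatched surrogates with U+FFFD, correcting the string in place.
void scrub_utf16le(std::u16string &s) {
  simdutf::to_well_formed_utf16le(s.data(), s.size(), s.data());
}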
+ * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated + * if successful. */ - simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0; + simdutf_warn_unused virtual result + validate_utf32_with_errors(const char32_t *buf, + size_t len) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 /** - * Convert Latin1 string into UTF8 string. + * Convert Latin1 string into UTF-8 string. * * This function is suitable to work with inputs from untrusted sources. * * @param input the Latin1 string to convert * @param length the length of the string in bytes - * @param latin1_output the pointer to buffer that can hold conversion result + * @param utf8_output the pointer to buffer that can hold conversion result * @return the number of written char; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_latin1_to_utf8(const char *input, size_t length, + char *utf8_output) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 - - /** +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + /** * Convert possibly Latin1 string into UTF-16LE string. * * This function is suitable to work with inputs from untrusted sources. @@ -2658,7 +11636,9 @@ class implementation { * @param utf16_buffer the pointer to buffer that can hold conversion result * @return the number of written char16_t; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_latin1_to_utf16le(const char *input, size_t length, + char16_t *utf16_output) const noexcept = 0; /** * Convert Latin1 string into UTF-16BE string. @@ -2670,8 +11650,12 @@ class implementation { * @param utf16_buffer the pointer to buffer that can hold conversion result * @return the number of written char16_t; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_latin1_to_utf16be(const char *input, size_t length, + char16_t *utf16_output) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 /** * Convert Latin1 string into UTF-32 string. * @@ -2682,9 +11666,13 @@ class implementation { * @param utf32_buffer the pointer to buffer that can hold conversion result * @return the number of written char32_t; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_latin1_to_utf32(const char *input, size_t length, + char32_t *utf32_buffer) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - /** +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + /** * Convert possibly broken UTF-8 string into latin1 string. * * During the conversion also validation of the input string is done. 
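A sketch of the Latin1-to-UTF-8 path above, paired with the matching length helper utf8_length_from_latin1 for exact sizing (the latin1_to_utf8 name is illustrative):

#include "simdutf.h"
#include <string>

// Latin1 always converts losslessly to UTF-8, so no error handling is needed.
std::string latin1_to_utf8(const std::string &latin1) {
  std::string utf8(
      simdutf::utf8_length_from_latin1(latin1.data(), latin1.size()), '\0');
  size_t written =
      simdutf::convert_latin1_to_utf8(latin1.data(), latin1.size(), utf8.data());
  utf8.resize(written);
  return utf8;
}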
@@ -2693,9 +11681,12 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param latin1_output the pointer to buffer that can hold conversion result - * @return the number of written char; 0 if the input was not valid UTF-8 string or if it cannot be represented as Latin1 + * @return the number of written char; 0 if the input was not valid UTF-8 + * string or if it cannot be represented as Latin1 */ - simdutf_warn_unused virtual size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf8_to_latin1(const char *input, size_t length, + char *latin1_output) const noexcept = 0; /** * Convert possibly broken UTF-8 string into latin1 string with errors. @@ -2708,25 +11699,40 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param latin1_output the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated + * if successful. */ - simdutf_warn_unused virtual result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) const noexcept = 0; + simdutf_warn_unused virtual result + convert_utf8_to_latin1_with_errors(const char *input, size_t length, + char *latin1_output) const noexcept = 0; - /** + /** * Convert valid UTF-8 string into latin1 string. * - * This function assumes that the input string is valid UTF-8. + * This function assumes that the input string is valid UTF-8 and that it can + * be represented as Latin1. If you violate this assumption, the result is + * implementation defined and may include system-dependent behavior such as + * crashes. + * + * This function is for expert users only and not part of our public API. Use + * convert_utf8_to_latin1 instead. * * This function is not BOM-aware. * * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param latin1_output the pointer to buffer that can hold conversion result - * @return the number of written char; 0 if the input was not valid UTF-8 string + * @return the number of written char; 0 if the input was not valid UTF-8 + * string */ - simdutf_warn_unused virtual size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0; - + simdutf_warn_unused virtual size_t + convert_valid_utf8_to_latin1(const char *input, size_t length, + char *latin1_output) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** * Convert possibly broken UTF-8 string into UTF-16LE string. 
* @@ -2736,9 +11742,12 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + * @return the number of written char16_t; 0 if the input was not valid UTF-8 + * string */ - simdutf_warn_unused virtual size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf8_to_utf16le(const char *input, size_t length, + char16_t *utf16_output) const noexcept = 0; /** * Convert possibly broken UTF-8 string into UTF-16BE string. @@ -2749,12 +11758,16 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + * @return the number of written char16_t; 0 if the input was not valid UTF-8 + * string */ - simdutf_warn_unused virtual size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf8_to_utf16be(const char *input, size_t length, + char16_t *utf16_output) const noexcept = 0; /** - * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error. + * Convert possibly broken UTF-8 string into UTF-16LE string and stop on + * error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -2762,12 +11775,18 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated + * if successful. */ - simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors( + const char *input, size_t length, + char16_t *utf16_output) const noexcept = 0; /** - * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error. + * Convert possibly broken UTF-8 string into UTF-16BE string and stop on + * error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -2775,10 +11794,61 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 
+ * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of code units validated + * if successful. + */ + simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors( + const char *input, size_t length, + char16_t *utf16_output) const noexcept = 0; + /** + * Compute the number of bytes that this UTF-16LE string would require in + * UTF-8 format even when the UTF-16LE content contains mismatched + * surrogates that have to be replaced by the replacement character (0xFFFD). + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte code units + * (char16_t) + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) where the count is the number of bytes required to + * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS + * or SURROGATE. The count is correct regardless of the error field. + * When SURROGATE is returned, it does not indicate an error in the case of + * this function: it indicates that at least one surrogate has been + * encountered: the surrogates may be matched or not (thus this function does + * not validate). If the returned error code is SUCCESS, then the input + * contains no surrogate, is in the Basic Multilingual Plane, and is + * necessarily valid. + */ + virtual simdutf_warn_unused result utf8_length_from_utf16le_with_replacement( + const char16_t *input, size_t length) const noexcept = 0; + + /** + * Compute the number of bytes that this UTF-16BE string would require in + * UTF-8 format even when the UTF-16BE content contains mismatched + * surrogates that have to be replaced by the replacement character (0xFFFD). + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte code units + * (char16_t) + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) where the count is the number of bytes required to + * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS + * or SURROGATE. The count is correct regardless of the error field. + * When SURROGATE is returned, it does not indicate an error in the case of + * this function: it indicates that at least one surrogate has been + * encountered: the surrogates may be matched or not (thus this function does + * not validate). If the returned error code is SUCCESS, then the input + * contains no surrogate, is in the Basic Multilingual Plane, and is + * necessarily valid. */ - simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + virtual simdutf_warn_unused result utf8_length_from_utf16be_with_replacement( + const char16_t *input, size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 + +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 /** * Convert possibly broken UTF-8 string into UTF-32 string. 
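Because the _with_replacement length functions are declared here as virtual members, a hedged sketch goes through the active implementation:

#include "simdutf.h"

// Size a UTF-8 buffer for UTF-16LE input that may hold mismatched surrogates.
size_t utf8_size_with_replacement(const char16_t *u16, size_t len) {
  simdutf::result r =
      simdutf::get_active_implementation()
          ->utf8_length_from_utf16le_with_replacement(u16, len);
  // r.count is valid whether r.error is SUCCESS or SURROGATE (see above).
  return r.count;
}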
* @@ -2788,9 +11858,12 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + * @return the number of written char32_t; 0 if the input was not valid UTF-8 + * string */ - simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf8_to_utf32(const char *input, size_t length, + char32_t *utf32_output) const noexcept = 0; /** * Convert possibly broken UTF-8 string into UTF-32 string and stop on error. @@ -2801,10 +11874,17 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char32_t written if + * successful. */ - simdutf_warn_unused virtual result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0; + simdutf_warn_unused virtual result + convert_utf8_to_utf32_with_errors(const char *input, size_t length, + char32_t *utf32_output) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** * Convert valid UTF-8 string into UTF-16LE string. * @@ -2815,9 +11895,11 @@ class implementation { * @param utf16_buffer the pointer to buffer that can hold conversion result * @return the number of written char16_t */ - simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf8_to_utf16le(const char *input, size_t length, + char16_t *utf16_buffer) const noexcept = 0; -/** + /** * Convert valid UTF-8 string into UTF-16BE string. * * This function assumes that the input string is valid UTF-8. @@ -2827,8 +11909,12 @@ class implementation { * @param utf16_buffer the pointer to buffer that can hold conversion result * @return the number of written char16_t */ - simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf8_to_utf16be(const char *input, size_t length, + char16_t *utf16_buffer) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 /** * Convert valid UTF-8 string into UTF-32 string.
* @@ -2839,32 +11925,48 @@ class implementation { * @param utf32_buffer the pointer to buffer that can hold conversion result * @return the number of written char32_t */ - simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf8_to_utf32(const char *input, size_t length, + char32_t *utf32_buffer) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** - * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format. + * Compute the number of 2-byte code units that this UTF-8 string would + * require in UTF-16LE format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-8 strings but in such cases the result is implementation defined. * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char16_t code units required to encode the UTF-8 string as UTF-16LE + * @return the number of char16_t code units required to encode the UTF-8 + * string as UTF-16LE */ - simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 - /** - * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format. +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 + /** + * Compute the number of 4-byte code units that this UTF-8 string would + * require in UTF-32 format. * - * This function is equivalent to count_utf8. + * This function is equivalent to count_utf8. It is acceptable to pass invalid + * UTF-8 strings but in such cases the result is implementation defined. * * This function does not validate the input. * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char32_t code units required to encode the UTF-8 string as UTF-32 + * @return the number of char32_t code units required to encode the UTF-8 + * string as UTF-32 */ - simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 /** * Convert possibly broken UTF-16LE string into Latin1 string. * @@ -2874,11 +11976,16 @@ class implementation { * This function is not BOM-aware.
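Pairing the length helpers above with a conversion; since utf16_length_from_utf8 does not validate, this sketch validates first (utf8_to_utf16le is an illustrative name):

#include "simdutf.h"
#include <string>

// UTF-8 -> UTF-16LE with exact sizing; returns empty on invalid input.
std::u16string utf8_to_utf16le(const char *u8, size_t len) {
  if (!simdutf::validate_utf8(u8, len)) {
    return {};
  }
  std::u16string out(simdutf::utf16_length_from_utf8(u8, len), u'\0');
  out.resize(simdutf::convert_utf8_to_utf16le(u8, len, out.data()));
  return out;
}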
* * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-16LE string or if it cannot be represented as Latin1 + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion + * result + * @return number of written code units; 0 if input is not a valid UTF-16LE + * string or if it cannot be represented as Latin1 */ - simdutf_warn_unused virtual size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf16le_to_latin1(const char16_t *input, size_t length, + char *latin1_buffer) const noexcept = 0; /** * Convert possibly broken UTF-16BE string into Latin1 string. @@ -2889,11 +11996,16 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-16BE string or if it cannot be represented as Latin1 + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion + * result + * @return number of written code units; 0 if input is not a valid UTF-16BE + * string or if it cannot be represented as Latin1 */ - simdutf_warn_unused virtual size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf16be_to_latin1(const char16_t *input, size_t length, + char *latin1_buffer) const noexcept = 0; /** * Convert possibly broken UTF-16LE string into Latin1 string. @@ -2905,11 +12017,18 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion + * result + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ - simdutf_warn_unused virtual result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + simdutf_warn_unused virtual result + convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length, + char *latin1_buffer) const noexcept = 0; /** * Convert possibly broken UTF-16BE string into Latin1 string. @@ -2921,40 +12040,69 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion + * result + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ - simdutf_warn_unused virtual result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + simdutf_warn_unused virtual result + convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length, + char *latin1_buffer) const noexcept = 0; /** * Convert valid UTF-16LE string into Latin1 string. * - * This function assumes that the input string is valid UTF-8. - + * This function assumes that the input string is valid UTF-16LE and that it + * can be represented as Latin1. If you violate this assumption, the result is + * implementation defined and may include system-dependent behavior such as + * crashes. + * + * This function is for expert users only and not part of our public API. Use + * convert_utf16le_to_latin1 instead. + * * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @param latin1_buffer the pointer to buffer that can hold conversion result + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion + * result * @return number of written code units; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf16le_to_latin1(const char16_t *input, size_t length, + char *latin1_buffer) const noexcept = 0; /** * Convert valid UTF-16BE string into Latin1 string. * - * This function assumes that the input string is valid UTF-8. + * This function assumes that the input string is valid UTF-16BE and that it + * can be represented as Latin1. If you violate this assumption, the result is + * implementation defined and may include system-dependent behavior such as + * crashes. + * + * This function is for expert users only and not part of our public API. Use + * convert_utf16be_to_latin1 instead. * * This function is not BOM-aware.
* * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @param latin1_buffer the pointer to buffer that can hold conversion result + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion + * result * @return number of written code units; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf16be_to_latin1(const char16_t *input, size_t length, + char *latin1_buffer) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** * Convert possibly broken UTF-16LE string into UTF-8 string. * @@ -2964,11 +12112,15 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE + * string */ - simdutf_warn_unused virtual size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf16le_to_utf8(const char16_t *input, size_t length, + char *utf8_buffer) const noexcept = 0; /** * Convert possibly broken UTF-16BE string into UTF-8 string. @@ -2979,14 +12131,19 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-16BE string + * @return number of written code units; 0 if input is not a valid UTF-16BE + * string */ - simdutf_warn_unused virtual size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf16be_to_utf8(const char16_t *input, size_t length, + char *utf8_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error. + * Convert possibly broken UTF-16LE string into UTF-8 string and stop on + * error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -2994,14 +12151,21 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 
+ * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ - simdutf_warn_unused virtual result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + simdutf_warn_unused virtual result + convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length, + char *utf8_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error. + * Convert possibly broken UTF-16BE string into UTF-8 string and stop on + * error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -3009,11 +12173,17 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ - simdutf_warn_unused virtual result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + simdutf_warn_unused virtual result + convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length, + char *utf8_buffer) const noexcept = 0; /** * Convert valid UTF-16LE string into UTF-8 string. @@ -3023,11 +12193,15 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param utf8_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf16le_to_utf8(const char16_t *input, size_t length, + char *utf8_buffer) const noexcept = 0; /** * Convert valid UTF-16BE string into UTF-8 string. @@ -3037,12 +12211,18 @@ class implementation { * This function is not BOM-aware. 
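A sketch for the untrusted UTF-16LE path above, using the _with_errors variant so the failure offset is preserved; sizing goes through utf8_length_from_utf16le (which, per the documentation, does not itself validate):

#include "simdutf.h"
#include <string>

// UTF-16LE -> UTF-8; on failure, err_pos receives the offending code unit index.
std::string utf16le_to_utf8(const char16_t *u16, size_t len, size_t &err_pos) {
  std::string out(simdutf::utf8_length_from_utf16le(u16, len), '\0');
  simdutf::result r =
      simdutf::convert_utf16le_to_utf8_with_errors(u16, len, out.data());
  if (r.error != simdutf::SUCCESS) {
    err_pos = r.count;
    return {};
  }
  out.resize(r.count); // r.count: number of char written on success
  return out;
}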
* * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param utf8_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf16be_to_utf8(const char16_t *input, size_t length, + char *utf8_buffer) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 /** * Convert possibly broken UTF-16LE string into UTF-32 string. * @@ -3052,11 +12232,15 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE + * string */ - simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf16le_to_utf32(const char16_t *input, size_t length, + char32_t *utf32_buffer) const noexcept = 0; /** * Convert possibly broken UTF-16BE string into UTF-32 string. @@ -3067,14 +12251,19 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-16BE string + * @return number of written code units; 0 if input is not a valid UTF-16BE + * string */ - simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf16be_to_utf32(const char16_t *input, size_t length, + char32_t *utf32_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error. + * Convert possibly broken UTF-16LE string into UTF-32 string and stop on + * error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -3082,14 +12271,21 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. 
+ * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char32_t written if + * successful. */ - simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors( + const char16_t *input, size_t length, + char32_t *utf32_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error. + * Convert possibly broken UTF-16BE string into UTF-32 string and stop on + * error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -3097,11 +12293,17 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char32_t written if + * successful. */ - simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors( + const char16_t *input, size_t length, + char32_t *utf32_buffer) const noexcept = 0; /** * Convert valid UTF-16LE string into UTF-32 string. @@ -3111,11 +12313,15 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param utf32_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf16le_to_utf32(const char16_t *input, size_t length, + char32_t *utf32_buffer) const noexcept = 0; /** * Convert valid UTF-16LE string into UTF-32BE string. @@ -3125,38 +12331,56 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param utf32_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf16be_to_utf32(const char16_t *input, size_t length, + char32_t *utf32_buffer) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 /** - * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. + * Compute the number of bytes that this UTF-16LE string would require in + * UTF-8 format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @return the number of bytes required to encode the UTF-16LE string as UTF-8 */ - simdutf_warn_unused virtual size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + utf8_length_from_utf16le(const char16_t *input, + size_t length) const noexcept = 0; /** - * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format. + * Compute the number of bytes that this UTF-16BE string would require in + * UTF-8 format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @return the number of bytes required to encode the UTF-16BE string as UTF-8 */ - simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + utf8_length_from_utf16be(const char16_t *input, + size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 /** * Convert possibly broken UTF-32 string into Latin1 string. * @@ -3166,13 +12390,19 @@ class implementation { * This function is not BOM-aware. 
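+ *
+ * Usage sketch (editorial illustration, not part of the vendored header;
+ * assumes the matching free function simdutf::convert_utf32_to_latin1):
+ *
+ *   // Latin1 needs exactly one byte per code point, so `length` bytes suffice
+ *   std::vector<char> latin1(length);
+ *   size_t written =
+ *       simdutf::convert_utf32_to_latin1(input, length, latin1.data());
+ *   if (written == 0) {
+ *     // input was not valid UTF-32 restricted to U+0000..U+00FF
+ *   }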
* * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) - * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-32 string + * @param length the length of the string in 4-byte code units + * (char32_t) + * @param latin1_buffer the pointer to buffer that can hold conversion + * result + * @return number of written code units; 0 if input is not a valid UTF-32 + * string */ + simdutf_warn_unused virtual size_t + convert_utf32_to_latin1(const char32_t *input, size_t length, + char *latin1_buffer) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - simdutf_warn_unused virtual size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0; - +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 /** * Convert possibly broken UTF-32 string into Latin1 string and stop on error. * If the string cannot be represented as Latin1, an error is returned. @@ -3183,26 +12413,45 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) - * @param latin1_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @param length the length of the string in 4-byte code units + * (char32_t) + * @param latin1_buffer the pointer to buffer that can hold conversion + * result + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ - simdutf_warn_unused virtual result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + simdutf_warn_unused virtual result + convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length, + char *latin1_buffer) const noexcept = 0; /** * Convert valid UTF-32 string into Latin1 string. * - * This function assumes that the input string is valid UTF-32. + * This function assumes that the input string is valid UTF-32 and can be + * represented as Latin1. If you violate this assumption, the result is + * implementation defined and may include system-dependent behavior such as + * crashes. + * + * This function is for expert users only and not part of our public API. Use + * convert_utf32_to_latin1 instead. * * This function is not BOM-aware. 
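+ *
+ * Illustration (editorial, not part of the vendored header; assumes the
+ * matching public helpers): skipping validation is only sound when validity
+ * was established beforehand.
+ *
+ *   // untrusted data: validate and convert in one pass; 0 signals failure
+ *   size_t n = simdutf::convert_utf32_to_latin1(src, len, dst);
+ *
+ *   // expert path: src already known to be valid UTF-32 with every code
+ *   // point at most U+00FF
+ *   size_t m = simdutf::convert_valid_utf32_to_latin1(src, len, dst);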
* * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) - * @param latin1_buffer the pointer to buffer that can hold the conversion result + * @param length the length of the string in 4-byte code units + * (char32_t) + * @param latin1_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf32_to_latin1(const char32_t *input, size_t length, + char *latin1_buffer) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 /** * Convert possibly broken UTF-32 string into UTF-8 string. * @@ -3212,11 +12461,15 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) + * @param length the length of the string in 4-byte code units + * (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written code units; 0 if input is not a valid UTF-32 string + * @return number of written code units; 0 if input is not a valid UTF-32 + * string */ - simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_utf32_to_utf8(const char32_t *input, size_t length, + char *utf8_buffer) const noexcept = 0; /** * Convert possibly broken UTF-32 string into UTF-8 string and stop on error. @@ -3227,11 +12480,17 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) + * @param length the length of the string in 4-byte code units + * (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char written if + * successful. */ - simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + simdutf_warn_unused virtual result + convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length, + char *utf8_buffer) const noexcept = 0; /** * Convert valid UTF-32 string into UTF-8 string. @@ -3241,23 +12500,35 @@ class implementation { * This function is not BOM-aware. 
 *
 * @param input the UTF-32 string to convert
- * @param length the length of the string in 4-byte code units (char32_t)
- * @param utf8_buffer the pointer to buffer that can hold the conversion result
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param utf8_buffer the pointer to a buffer that can hold the conversion
+ * result
 * @return number of written code units; 0 if conversion is not possible
 */
- simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
+                             char *utf8_buffer) const noexcept = 0;
+#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
-
- /**
- * Return the number of bytes that this UTF-16 string would require in Latin1 format.
- *
- *
- * @param input the UTF-16 string to convert
- * @param length the length of the string in 2-byte code units (char16_t)
- * @return the number of bytes required to encode the UTF-16 string as Latin1
- */
- simdutf_warn_unused virtual size_t utf16_length_from_latin1(size_t length) const noexcept = 0;
+#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
+ /**
+ * Return the number of two-byte code units (char16_t) that this Latin1
+ * string would require in UTF-16 format.
+ *
+ * @param length the length of the Latin1 string in bytes
+ * @return the number of char16_t required to encode the Latin1 string as
+ * UTF-16
+ */
+ simdutf_warn_unused virtual size_t
+ utf16_length_from_latin1(size_t length) const noexcept {
+   return length;
+ }
+#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

+#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
 /**
 * Convert possibly broken UTF-32 string into UTF-16LE string.
 *
@@ -3267,11 +12538,15 @@ class implementation {
 * This function is not BOM-aware.
 *
 * @param input the UTF-32 string to convert
- * @param length the length of the string in 4-byte code units (char32_t)
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
 * @param utf16_buffer the pointer to buffer that can hold conversion result
- * @return number of written code units; 0 if input is not a valid UTF-32 string
+ * @return number of written code units; 0 if input is not a valid UTF-32
+ * string
 */
- simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+ simdutf_warn_unused virtual size_t
+ convert_utf32_to_utf16le(const char32_t *input, size_t length,
+                          char16_t *utf16_buffer) const noexcept = 0;

 /**
 * Convert possibly broken UTF-32 string into UTF-16BE string.
@@ -3282,14 +12557,19 @@ class implementation {
 * This function is not BOM-aware.
 *
 * @param input the UTF-32 string to convert
- * @param length the length of the string in 4-byte code units (char32_t)
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
 * @param utf16_buffer the pointer to buffer that can hold conversion result
- * @return number of written code units; 0 if input is not a valid UTF-32 string
+ * @return number of written code units; 0 if input is not a valid UTF-32
+ * string
 */
- simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+ simdutf_warn_unused virtual size_t
+ convert_utf32_to_utf16be(const char32_t *input, size_t length,
+                          char16_t *utf16_buffer) const noexcept = 0;

 /**
- * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
+ * Convert possibly broken UTF-32 string into UTF-16LE string and stop on + * error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -3297,14 +12577,21 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) + * @param length the length of the string in 4-byte code units + * (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char16_t written if + * successful. */ - simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors( + const char32_t *input, size_t length, + char16_t *utf16_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error. + * Convert possibly broken UTF-32 string into UTF-16BE string and stop on + * error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -3312,11 +12599,17 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) + * @param length the length of the string in 4-byte code units + * (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in code units) if any, or the number of char16_t written if + * successful. */ - simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors( + const char32_t *input, size_t length, + char16_t *utf16_buffer) const noexcept = 0; /** * Convert valid UTF-32 string into UTF-16LE string. @@ -3326,11 +12619,15 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) - * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @param length the length of the string in 4-byte code units + * (char32_t) + * @param utf16_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf32_to_utf16le(const char32_t *input, size_t length, + char16_t *utf16_buffer) const noexcept = 0; /** * Convert valid UTF-32 string into UTF-16BE string. @@ -3340,294 +12637,564 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) - * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @param length the length of the string in 4-byte code units + * (char32_t) + * @param utf16_buffer the pointer to a buffer that can hold the conversion + * result * @return number of written code units; 0 if conversion is not possible */ - simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + simdutf_warn_unused virtual size_t + convert_valid_utf32_to_utf16be(const char32_t *input, size_t length, + char16_t *utf16_buffer) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 /** - * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or - * from UTF-16BE to UTF-16LE. + * Change the endianness of the input. Can be used to go from UTF-16LE to + * UTF-16BE or from UTF-16BE to UTF-16LE. * * This function does not validate the input. * * This function is not BOM-aware. * * @param input the UTF-16 string to process - * @param length the length of the string in 2-byte code units (char16_t) - * @param output the pointer to buffer that can hold the conversion result + * @param length the length of the string in 2-byte code units + * (char16_t) + * @param output the pointer to a buffer that can hold the conversion + * result */ - virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0; + virtual void change_endianness_utf16(const char16_t *input, size_t length, + char16_t *output) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 - /** - * Return the number of bytes that this Latin1 string would require in UTF-8 format. +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 + /** + * Return the number of bytes that this Latin1 string would require in UTF-8 + * format. * * @param input the Latin1 string to convert * @param length the length of the string bytes * @return the number of bytes required to encode the Latin1 string as UTF-8 */ - simdutf_warn_unused virtual size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 /** - * Compute the number of bytes that this UTF-32 string would require in UTF-8 format. 
+ * Compute the number of bytes that this UTF-32 string would require in UTF-8 + * format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-32 strings but in such cases the result is implementation defined. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) + * @param length the length of the string in 4-byte code units + * (char32_t) * @return the number of bytes required to encode the UTF-32 string as UTF-8 */ - simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + utf8_length_from_utf32(const char32_t *input, + size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 /** - * Compute the number of bytes that this UTF-32 string would require in Latin1 format. + * Compute the number of bytes that this UTF-32 string would require in Latin1 + * format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-32 strings but in such cases the result is implementation defined. * - * @param length the length of the string in 4-byte code units (char32_t) + * @param length the length of the string in 4-byte code units + * (char32_t) * @return the number of bytes required to encode the UTF-32 string as Latin1 */ - simdutf_warn_unused virtual size_t latin1_length_from_utf32(size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + latin1_length_from_utf32(size_t length) const noexcept { + return length; + } +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 /** - * Compute the number of bytes that this UTF-8 string would require in Latin1 format. + * Compute the number of bytes that this UTF-8 string would require in Latin1 + * format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-8 strings but in such cases the result is implementation defined. * * @param input the UTF-8 string to convert * @param length the length of the string in byte * @return the number of bytes required to encode the UTF-8 string as Latin1 */ - simdutf_warn_unused virtual size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 - /* - * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format. +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 + /** + * Compute the number of bytes that this UTF-16LE/BE string would require in + * Latin1 format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. 
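+ *
+ * Usage sketch (editorial illustration, not part of the vendored header;
+ * assumes the matching free functions of the public API): because Latin1 is
+ * a single-byte encoding, the required buffer size is just the code-unit
+ * count.
+ *
+ *   std::vector<char> latin1(simdutf::latin1_length_from_utf16(len)); // == len
+ *   simdutf::result r = simdutf::convert_utf16le_to_latin1_with_errors(
+ *       u16, len, latin1.data());
+ *   // r.error != simdutf::SUCCESS if some code point is above U+00FF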
* * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @return the number of bytes required to encode the UTF-16LE string as Latin1 + * @param length the length of the string in 2-byte code units + * (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as + * Latin1 */ - simdutf_warn_unused virtual size_t latin1_length_from_utf16(size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + latin1_length_from_utf16(size_t length) const noexcept { + return length; + } +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 /** - * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format. + * Compute the number of two-byte code units that this UTF-32 string would + * require in UTF-16 format. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-32 strings but in such cases the result is implementation defined. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) + * @param length the length of the string in 4-byte code units + * (char32_t) * @return the number of bytes required to encode the UTF-32 string as UTF-16 */ - simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + utf16_length_from_utf32(const char32_t *input, + size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 - - /** - * Return the number of bytes that this UTF-32 string would require in Latin1 format. - * - * This function does not validate the input. +#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 + /** + * Return the number of bytes that this UTF-32 string would require in Latin1 + * format. * - * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte code units (char32_t) + * @param length the length of the string in 4-byte code units + * (char32_t) * @return the number of bytes required to encode the UTF-32 string as Latin1 */ - simdutf_warn_unused virtual size_t utf32_length_from_latin1(size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + utf32_length_from_latin1(size_t length) const noexcept { + return length; + } +#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 - /* - * Compute the number of bytes that this UTF-16LE string would require in UTF-32 format. +#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 + /** + * Compute the number of bytes that this UTF-16LE string would require in + * UTF-32 format. * * This function is equivalent to count_utf16le. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. 
* * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @return the number of bytes required to encode the UTF-16LE string as UTF-32 + * @param length the length of the string in 2-byte code units + * (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as + * UTF-32 */ - simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + utf32_length_from_utf16le(const char16_t *input, + size_t length) const noexcept = 0; - /* - * Compute the number of bytes that this UTF-16BE string would require in UTF-32 format. + /** + * Compute the number of bytes that this UTF-16BE string would require in + * UTF-32 format. * * This function is equivalent to count_utf16be. * - * This function does not validate the input. + * This function does not validate the input. It is acceptable to pass invalid + * UTF-16 strings but in such cases the result is implementation defined. * * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte code units (char16_t) - * @return the number of bytes required to encode the UTF-16BE string as UTF-32 + * @param length the length of the string in 2-byte code units + * (char16_t) + * @return the number of bytes required to encode the UTF-16BE string as + * UTF-32 */ - simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + utf32_length_from_utf16be(const char16_t *input, + size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 +#if SIMDUTF_FEATURE_UTF16 /** * Count the number of code points (characters) in the string assuming that * it is valid. * * This function assumes that the input string is valid UTF-16LE. + * It is acceptable to pass invalid UTF-16 strings but in such cases + * the result is implementation defined. * * This function is not BOM-aware. * * @param input the UTF-16LE string to process - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @return number of code points */ - simdutf_warn_unused virtual size_t count_utf16le(const char16_t * input, size_t length) const noexcept = 0; + simdutf_warn_unused virtual size_t + count_utf16le(const char16_t *input, size_t length) const noexcept = 0; /** * Count the number of code points (characters) in the string assuming that * it is valid. * * This function assumes that the input string is valid UTF-16BE. + * It is acceptable to pass invalid UTF-16 strings but in such cases + * the result is implementation defined. * * This function is not BOM-aware. * * @param input the UTF-16BE string to process - * @param length the length of the string in 2-byte code units (char16_t) + * @param length the length of the string in 2-byte code units + * (char16_t) * @return number of code points */ - simdutf_warn_unused virtual size_t count_utf16be(const char16_t * input, size_t length) const noexcept = 0; - + simdutf_warn_unused virtual size_t + count_utf16be(const char16_t *input, size_t length) const noexcept = 0; +#endif // SIMDUTF_FEATURE_UTF16 +#if SIMDUTF_FEATURE_UTF8 /** * Count the number of code points (characters) in the string assuming that * it is valid. 
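+ *
+ * Usage sketch (editorial illustration, not part of the vendored header;
+ * assumes the public free functions): since the counting functions assume
+ * valid input, untrusted data is typically validated first.
+ *
+ *   if (simdutf::validate_utf8(buf, len)) {
+ *     size_t code_points = simdutf::count_utf8(buf, len); // always <= len
+ *   }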
 *
 * This function assumes that the input string is valid UTF-8.
+ * It is acceptable to pass invalid UTF-8 strings but in such cases
+ * the result is implementation defined.
 *
 * @param input the UTF-8 string to process
 * @param length the length of the string in bytes
 * @return number of code points
 */
- simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0;
+ simdutf_warn_unused virtual size_t
+ count_utf8(const char *input, size_t length) const noexcept = 0;
+#endif // SIMDUTF_FEATURE_UTF8
+
+#if SIMDUTF_FEATURE_BASE64
 /**
 * Provide the maximal binary length in bytes given the base64 input.
- * In general, if the input contains ASCII spaces, the result will be less than
- * the maximum length.
+ * As long as the input does not contain ignorable characters (e.g., ASCII
+ * spaces or linefeed characters), the result is exact. In particular, the
+ * function checks for padding characters.
+ *
+ * The function is fast (constant time). It checks up to two characters at
+ * the end of the string. The input is not otherwise validated or read.
+ *
+ * @param input the base64 input to process
+ * @param length the length of the base64 input in bytes
+ * @return maximal number of binary bytes
 */
- simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept = 0;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+     const char *input, size_t length) const noexcept;

 /**
 * Provide the maximal binary length in bytes given the base64 input.
- * In general, if the input contains ASCII spaces, the result will be less than
- * the maximum length.
+ * As long as the input does not contain ignorable characters (e.g., ASCII
+ * spaces or linefeed characters), the result is exact. In particular, the
+ * function checks for padding characters.
 *
- * @param input the base64 input to process, in ASCII stored as 16-bit units
+ * The function is fast (constant time). It checks up to two characters at
+ * the end of the string. The input is not otherwise validated or read.
+ *
+ * @param input the base64 input to process, in ASCII stored as 16-bit
+ * units
 * @param length the length of the base64 input in 16-bit units
 * @return maximal number of binary bytes
 */
- simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept = 0;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+     const char16_t *input, size_t length) const noexcept;

 /**
 * Convert a base64 input to a binary output.
 *
- * This function follows the WHATWG forgiving-base64 format, which means that it will
- * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
- * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ * This function follows the WHATWG forgiving-base64 format, which means that
+ * it will ignore any ASCII spaces in the input. You may provide a padded
+ * input (with one or two equal signs at the end) or an unpadded input
+ * (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input.
When last_chunk_options = + * loose, there are two possible reasons for failure: the input contains a + * number of base64 characters that when divided by 4, leaves a single + * remainder character (BASE64_INPUT_REMAINDER), or the input contains a + * character that is not a valid base64 character (INVALID_BASE64_CHARACTER). + * + * You should call this function with a buffer that is at least + * maximal_binary_length_from_base64(input, length) bytes long. If you fail to + * provide that much space, the function may cause a buffer overflow. + * + * @param input the base64 string to process + * @param length the length of the string in bytes + * @param output the pointer to a buffer that can hold the conversion + * result (should be at least maximal_binary_length_from_base64(input, length) + * bytes long). + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and either position of the error + * (in the input in bytes) if any, or the number of bytes written if + * successful. + */ + simdutf_warn_unused virtual result + base64_to_binary(const char *input, size_t length, char *output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept = 0; + + /** + * Convert a base64 input to a binary output while returning more details + * than base64_to_binary. + * + * This function follows the WHATWG forgiving-base64 format, which means that + * it will ignore any ASCII spaces in the input. You may provide a padded + * input (with one or two equal signs at the end) or an unpadded input + * (without any equal signs at the end). * * See https://infra.spec.whatwg.org/#forgiving-base64-decode * - * This function will fail in case of invalid input. There are two possible reasons for - * failure: the input contains a number of base64 characters that when divided by 4, leaves - * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character - * that is not a valid base64 character (INVALID_BASE64_CHARACTER). + * This function will fail in case of invalid input. When last_chunk_options = + * loose, there are two possible reasons for failure: the input contains a + * number of base64 characters that when divided by 4, leaves a single + * remainder character (BASE64_INPUT_REMAINDER), or the input contains a + * character that is not a valid base64 character (INVALID_BASE64_CHARACTER). * - * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long. - * If you fail to provide that much space, the function may cause a buffer overflow. + * You should call this function with a buffer that is at least + * maximal_binary_length_from_base64(input, length) bytes long. If you fail to + * provide that much space, the function may cause a buffer overflow. * * @param input the base64 string to process * @param length the length of the string in bytes - * @param output the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long). - * @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default. 
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful. + * @param output the pointer to a buffer that can hold the conversion + * result (should be at least maximal_binary_length_from_base64(input, length) + * bytes long). + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. + * @return a full_result pair struct (of type simdutf::result containing the + * three fields error, input_count and output_count). */ - simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0; + simdutf_warn_unused virtual full_result base64_to_binary_details( + const char *input, size_t length, char *output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept = 0; /** * Convert a base64 input to a binary output. * - * This function follows the WHATWG forgiving-base64 format, which means that it will - * ignore any ASCII spaces in the input. You may provide a padded input (with one or two - * equal signs at the end) or an unpadded input (without any equal signs at the end). + * This function follows the WHATWG forgiving-base64 format, which means that + * it will ignore any ASCII spaces in the input. You may provide a padded + * input (with one or two equal signs at the end) or an unpadded input + * (without any equal signs at the end). * * See https://infra.spec.whatwg.org/#forgiving-base64-decode * - * This function will fail in case of invalid input. There are two possible reasons for - * failure: the input contains a number of base64 characters that when divided by 4, leaves - * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character - * that is not a valid base64 character (INVALID_BASE64_CHARACTER). + * This function will fail in case of invalid input. When last_chunk_options = + * loose, there are two possible reasons for failure: the input contains a + * number of base64 characters that when divided by 4, leaves a single + * remainder character (BASE64_INPUT_REMAINDER), or the input contains a + * character that is not a valid base64 character (INVALID_BASE64_CHARACTER). * - * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long. - * If you fail to provide that much space, the function may cause a buffer overflow. + * You should call this function with a buffer that is at least + * maximal_binary_length_from_base64(input, length) bytes long. If you + * fail to provide that much space, the function may cause a buffer overflow. * - * @param input the base64 string to process, in ASCII stored as 16-bit units + * @param input the base64 string to process, in ASCII stored as + * 16-bit units * @param length the length of the string in 16-bit units - * @param output the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long). - * @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default. 
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of bytes written if successful. + * @param output the pointer to a buffer that can hold the conversion + * result (should be at least maximal_binary_length_from_base64(input, length) + * bytes long). + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. + * @return a result pair struct (of type simdutf::result containing the two + * fields error and count) with an error code and position of the + * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the + * number of bytes written if successful. + */ + simdutf_warn_unused virtual result + base64_to_binary(const char16_t *input, size_t length, char *output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept = 0; + + /** + * Convert a base64 input to a binary output while returning more details + * than base64_to_binary. + * + * This function follows the WHATWG forgiving-base64 format, which means that + * it will ignore any ASCII spaces in the input. You may provide a padded + * input (with one or two equal signs at the end) or an unpadded input + * (without any equal signs at the end). + * + * See https://infra.spec.whatwg.org/#forgiving-base64-decode + * + * This function will fail in case of invalid input. When last_chunk_options = + * loose, there are two possible reasons for failure: the input contains a + * number of base64 characters that when divided by 4, leaves a single + * remainder character (BASE64_INPUT_REMAINDER), or the input contains a + * character that is not a valid base64 character (INVALID_BASE64_CHARACTER). + * + * You should call this function with a buffer that is at least + * maximal_binary_length_from_base64(input, length) bytes long. If you fail to + * provide that much space, the function may cause a buffer overflow. + * + * @param input the base64 string to process + * @param length the length of the string in bytes + * @param output the pointer to a buffer that can hold the conversion + * result (should be at least maximal_binary_length_from_base64(input, length) + * bytes long). + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. + * @return a full_result pair struct (of type simdutf::result containing the + * three fields error, input_count and output_count). */ - simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0; + simdutf_warn_unused virtual full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept = 0; /** * Provide the base64 length in bytes given the length of a binary input. * * @param length the length of the input in bytes - * @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default. + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. 
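+ *
+ * Decoding sketch for the base64_to_binary family documented above
+ * (editorial illustration, not part of the vendored header; `b64`/`n` are
+ * placeholders and we assume the matching public free functions):
+ *
+ *   std::vector<char> bin(simdutf::maximal_binary_length_from_base64(b64, n));
+ *   simdutf::result r = simdutf::base64_to_binary(b64, n, bin.data());
+ *   if (r.error == simdutf::SUCCESS) {
+ *     bin.resize(r.count); // r.count is the number of bytes written
+ *   } else {
+ *     // r.count points at the offending position in the input
+ *   }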
* @return number of base64 bytes */ - simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length, base64_options options = base64_default) const noexcept = 0; + simdutf_warn_unused size_t base64_length_from_binary( + size_t length, base64_options options = base64_default) const noexcept; + + /** + * Convert a binary input to a base64 output. + * + * The default option (simdutf::base64_default) uses the characters `+` and + * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of + * the output to ensure that the output length is a multiple of four. + * + * The URL option (simdutf::base64_url) uses the characters `-` and `_` as + * part of its alphabet. No padding is added at the end of the output. + * + * This function always succeeds. + * + * @param input the binary to process + * @param length the length of the input in bytes + * @param output the pointer to a buffer that can hold the conversion + * result (should be at least base64_length_from_binary(length) bytes long) + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. + * @return number of written bytes, will be equal to + * base64_length_from_binary(length, options) + */ + virtual size_t + binary_to_base64(const char *input, size_t length, char *output, + base64_options options = base64_default) const noexcept = 0; /** - * Convert a binary input to a base64 output. The output is always padded with equal signs so that it is - * a multiple of 4 bytes long. + * Convert a binary input to a base64 output with lines of given length. + * Lines are separated by a single linefeed character. + * + * The default option (simdutf::base64_default) uses the characters `+` and + * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of + * the output to ensure that the output length is a multiple of four. + * + * The URL option (simdutf::base64_url) uses the characters `-` and `_` as + * part of its alphabet. No padding is added at the end of the output. * * This function always succeeds. * * @param input the binary to process * @param length the length of the input in bytes - * @param output the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long) - * @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default. - * @return number of written bytes, will be equal to base64_length_from_binary(length, options) + * @param output the pointer to a buffer that can hold the conversion + * result (should be at least base64_length_from_binary_with_lines(length, + * options, line_length) bytes long) + * @param line_length the length of each line, values smaller than 4 are + * interpreted as 4 + * @param options the base64 options to use, can be base64_default or + * base64_url, is base64_default by default. + * @return number of written bytes, will be equal to + * base64_length_from_binary_with_lines(length, options, line_length) */ - virtual size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0; + virtual size_t binary_to_base64_with_lines( + const char *input, size_t length, char *output, + size_t line_length = simdutf::default_line_length, + base64_options options = base64_default) const noexcept = 0; + /** + * Find the first occurrence of a character in a string. If the character is + * not found, return a pointer to the end of the string. 
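+ *
+ * Encoding sketch for binary_to_base64 documented above (editorial
+ * illustration, not part of the vendored header; `data`/`len` are
+ * placeholders and we assume the matching public free functions):
+ *
+ *   std::string b64(simdutf::base64_length_from_binary(len), '\0');
+ *   size_t written = simdutf::binary_to_base64(data, len, &b64[0]);
+ *   // written == simdutf::base64_length_from_binary(len); with
+ *   // simdutf::base64_url no '=' padding is appended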
+ * @param start the start of the string
+ * @param end the end of the string
+ * @param character the character to find
+ * @return a pointer to the first occurrence of the character in the string,
+ * or a pointer to the end of the string if the character is not found.
+ *
+ */
+ virtual const char *find(const char *start, const char *end,
+                          char character) const noexcept = 0;
+ virtual const char16_t *find(const char16_t *start, const char16_t *end,
+                              char16_t character) const noexcept = 0;
+#endif // SIMDUTF_FEATURE_BASE64
+
+#ifdef SIMDUTF_INTERNAL_TESTS
+ // This method is exported only in developer mode; its purpose
+ // is to expose some internal test procedures from the given
+ // implementation and then use them through our standard test
+ // framework.
+ //
+ // Regular users should not use it; the tests of the public
+ // API are enough.
+
+ struct TestProcedure {
+   // display name
+   std::string name;
+
+   // procedure should check whether the given test passes
+   void (*procedure)(const implementation &);
+ };
+
+ virtual std::vector<TestProcedure> internal_tests() const;
+#endif

 protected:
- /** @private Construct an implementation with the given name and description. For subclasses. */
- simdutf_really_inline implementation(
- const char* name,
- const char* description,
- uint32_t required_instruction_sets
- ) :
- _name(name),
- _description(description),
- _required_instruction_sets(required_instruction_sets)
- {
- }
+ /** @private Construct an implementation with the given name and description.
+  * For subclasses. */
+ simdutf_really_inline implementation(const char *name,
+                                      const char *description,
+                                      uint32_t required_instruction_sets)
+     : _name(name), _description(description),
+       _required_instruction_sets(required_instruction_sets) {}
+
 protected:
 ~implementation() = default;
+
 private:
 /**
 * The name of this implementation.
 */
- const char* _name;
+ const char *_name;

 /**
 * The description of this implementation.
 */
- const char* _description;
+ const char *_description;

 /**
 * Instruction sets required for this implementation.
@@ -3648,26 +13215,28 @@ class available_implementation_list {
 /** Number of implementations */
 size_t size() const noexcept;
 /** STL const begin() iterator */
- const implementation * const *begin() const noexcept;
+ const implementation *const *begin() const noexcept;
 /** STL const end() iterator */
- const implementation * const *end() const noexcept;
+ const implementation *const *end() const noexcept;

 /**
 * Get the implementation with the given name.
 *
 * Case sensitive.
 *
- *   const implementation *impl = simdutf::available_implementations["westmere"];
- *   if (!impl) { exit(1); }
- *   if (!imp->supported_by_runtime_system()) { exit(1); }
+ *   const implementation *impl =
+ *       simdutf::available_implementations["westmere"];
+ *   if (!impl) { exit(1); }
+ *   if (!impl->supported_by_runtime_system()) { exit(1); }
 *   simdutf::active_implementation = impl;
 *
 * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
 * @return the implementation, or nullptr if the parse failed.
 */
- const implementation * operator[](const std::string &name) const noexcept {
- for (const implementation * impl : *this) {
- if (impl->name() == name) { return impl; }
+ const implementation *operator[](const std::string &name) const noexcept {
+   for (const implementation *impl : *this) {
+     if (impl->name() == name) {
+       return impl;
+     }
   }
   return nullptr;
 }
@@ -3677,48 +13246,54 @@ class available_implementation_list {
 *
 * This is used to initialize the implementation on startup.
 *
- *   const implementation *impl = simdutf::available_implementation::detect_best_supported();
+ *   const implementation *impl =
+ *       simdutf::available_implementation::detect_best_supported();
 *   simdutf::active_implementation = impl;
 *
- * @return the most advanced supported implementation for the current host, or an
- * implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported
- * implementation. Will never return nullptr.
+ * @return the most advanced supported implementation for the current host, or
+ * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
+ * supported implementation. Will never return nullptr.
 */
 const implementation *detect_best_supported() const noexcept;
};

-template <typename T>
-class atomic_ptr {
+template <typename T> class atomic_ptr {
public:
 atomic_ptr(T *_ptr) : ptr{_ptr} {}

#if defined(SIMDUTF_NO_THREADS)
- operator const T*() const { return ptr; }
- const T& operator*() const { return *ptr; }
- const T* operator->() const { return ptr; }
-
- operator T*() { return ptr; }
- T& operator*() { return *ptr; }
- T* operator->() { return ptr; }
- atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }
+ operator const T *() const { return ptr; }
+ const T &operator*() const { return *ptr; }
+ const T *operator->() const { return ptr; }
+
+ operator T *() { return ptr; }
+ T &operator*() { return *ptr; }
+ T *operator->() { return ptr; }
+ atomic_ptr &operator=(T *_ptr) {
+   ptr = _ptr;
+   return *this;
+ }
#else
- operator const T*() const { return ptr.load(); }
- const T& operator*() const { return *ptr; }
- const T* operator->() const { return ptr.load(); }
-
- operator T*() { return ptr.load(); }
- T& operator*() { return *ptr; }
- T* operator->() { return ptr.load(); }
- atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }
+ operator const T *() const { return ptr.load(); }
+ const T &operator*() const { return *ptr; }
+ const T *operator->() const { return ptr.load(); }
+
+ operator T *() { return ptr.load(); }
+ T &operator*() { return *ptr; }
+ T *operator->() { return ptr.load(); }
+ atomic_ptr &operator=(T *_ptr) {
+   ptr = _ptr;
+   return *this;
+ }
#endif

private:
#if defined(SIMDUTF_NO_THREADS)
- T* ptr;
+ T *ptr;
#else
- std::atomic<T*> ptr;
+ std::atomic<T *> ptr;
#endif
};

@@ -3729,27 +13304,268 @@ class detect_best_supported_implementation_on_first_use;

/**
 * The list of available implementations compiled into simdutf.
 */
-extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations();
+extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
+get_available_implementations();
+
+/**
+ * The active implementation.
+ *
+ * Automatically initialized on first use to the most advanced implementation
+ * supported by this hardware.
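+ *
+ * Usage sketch (editorial illustration, not part of the vendored header):
+ * enumerating the compiled-in implementations and pinning one at startup,
+ * using the two accessors declared here:
+ *
+ *   for (const simdutf::implementation *candidate :
+ *        simdutf::get_available_implementations()) {
+ *     // candidate->name(), candidate->description(),
+ *     // candidate->supported_by_runtime_system()
+ *   }
+ *   const simdutf::implementation *impl =
+ *       simdutf::get_available_implementations()["fallback"];
+ *   if (impl && impl->supported_by_runtime_system()) {
+ *     simdutf::get_active_implementation() = impl;
+ *   }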
+ */
+extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
+get_active_implementation();
+
+} // namespace simdutf
+
+#if SIMDUTF_FEATURE_BASE64
+  // this header is not part of the public api
+/* begin file include/simdutf/base64_implementation.h */
+#ifndef SIMDUTF_BASE64_IMPLEMENTATION_H
+#define SIMDUTF_BASE64_IMPLEMENTATION_H
+
+// this is not part of the public api
+
+namespace simdutf {
+
+template <typename chartype>
+simdutf_warn_unused simdutf_constexpr23 result slow_base64_to_binary_safe_impl(
+    const chartype *input, size_t length, char *output, size_t &outlen,
+    base64_options options,
+    last_chunk_handling_options last_chunk_options) noexcept {
+  const bool ignore_garbage = (options & base64_default_accept_garbage) != 0;
+  auto ri = simdutf::scalar::base64::find_end(input, length, options);
+  size_t equallocation = ri.equallocation;
+  size_t equalsigns = ri.equalsigns;
+  length = ri.srclen;
+  size_t full_input_length = ri.full_input_length;
+  (void)full_input_length;
+  if (length == 0) {
+    outlen = 0;
+    if (!ignore_garbage && equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation};
+    }
+    return {SUCCESS, 0};
+  }
+
+  // The parameters of base64_tail_decode_safe are:
+  // - dst: the output buffer
+  // - outlen: the size of the output buffer
+  // - src: the input buffer
+  // - length: the size of the input buffer
+  // - padded_characters: the number of padding characters
+  // - options: the options for the base64 decoder
+  // - last_chunk_options: the options for the last chunk
+  // The function will return the number of bytes written to the output buffer
+  // and the number of bytes read from the input buffer.
+  // The function will also return an error code if the input buffer is not
+  // valid base64.
+  full_result r = scalar::base64::base64_tail_decode_safe(
+      output, outlen, input, length, equalsigns, options, last_chunk_options);
+  r = scalar::base64::patch_tail_result(r, 0, 0, equallocation,
+                                        full_input_length, last_chunk_options);
+  outlen = r.output_count;
+  if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
+      equalsigns > 0) {
+    // additional checks
+    if ((outlen % 3 == 0) || ((outlen % 3) + 1 + equalsigns != 4)) {
+      r.error = error_code::INVALID_BASE64_CHARACTER;
+    }
+  }
+  return {r.error, r.input_count}; // we cannot return r itself because it gets
+                                   // converted to error/output_count
+}
+
+template <typename chartype>
+simdutf_warn_unused simdutf_constexpr23 result base64_to_binary_safe_impl(
+    const chartype *input, size_t length, char *output, size_t &outlen,
+    base64_options options,
+    last_chunk_handling_options last_chunk_handling_options,
+    bool decode_up_to_bad_char) noexcept {
+  static_assert(std::is_same<chartype, char>::value ||
+                    std::is_same<chartype, char16_t>::value,
+                "Only char and char16_t are supported.");
+  size_t remaining_input_length = length;
+  size_t remaining_output_length = outlen;
+  size_t input_position = 0;
+  size_t output_position = 0;
+
+  // We also do a first pass using the fast path to decode as much as possible
+  size_t safe_input = (std::min)(
+      remaining_input_length,
+      base64_length_from_binary(remaining_output_length / 3 * 3, options));
+  bool done_with_partial = (safe_input == remaining_input_length);
+  simdutf::full_result r;
+
+#if SIMDUTF_CPLUSPLUS23
+  if consteval {
+    r = scalar::base64::base64_to_binary_details_impl(
+        input + input_position, safe_input, output + output_position, options,
+        done_with_partial
+            ?
last_chunk_handling_options + : simdutf::last_chunk_handling_options::only_full_chunks); + } else +#endif + { + r = get_active_implementation()->base64_to_binary_details( + input + input_position, safe_input, output + output_position, options, + done_with_partial + ? last_chunk_handling_options + : simdutf::last_chunk_handling_options::only_full_chunks); + } + simdutf_log_assert(r.input_count <= safe_input, + "You should not read more than safe_input"); + simdutf_log_assert(r.output_count <= remaining_output_length, + "You should not write more than remaining_output_length"); + // Technically redundant, but we want to be explicit about it. + input_position += r.input_count; + output_position += r.output_count; + remaining_input_length -= r.input_count; + remaining_output_length -= r.output_count; + if (r.error != simdutf::error_code::SUCCESS) { + // There is an error. We return. + if (decode_up_to_bad_char && + r.error == error_code::INVALID_BASE64_CHARACTER) { + return slow_base64_to_binary_safe_impl( + input, length, output, outlen, options, last_chunk_handling_options); + } + outlen = output_position; + return {r.error, input_position}; + } + + if (done_with_partial) { + // We are done. We have decoded everything. + outlen = output_position; + return {simdutf::error_code::SUCCESS, input_position}; + } + // We have decoded some data, but we still have some data to decode. + // We need to decode the rest of the input buffer. + r = simdutf::scalar::base64::base64_to_binary_details_safe_impl( + input + input_position, remaining_input_length, output + output_position, + remaining_output_length, options, last_chunk_handling_options); + input_position += r.input_count; + output_position += r.output_count; + remaining_input_length -= r.input_count; + remaining_output_length -= r.output_count; + + if (r.error != simdutf::error_code::SUCCESS) { + // There is an error. We return. + if (decode_up_to_bad_char && + r.error == error_code::INVALID_BASE64_CHARACTER) { + return slow_base64_to_binary_safe_impl( + input, length, output, outlen, options, last_chunk_handling_options); + } + outlen = output_position; + return {r.error, input_position}; + } + if (input_position < length) { + // We cannot process the entire input in one go, so we need to + // process it in two steps: first the fast path, then the slow path. + // In some cases, the processing might 'eat up' trailing ignorable + // characters in the fast path, but that can be a problem. + // suppose we have just white space followed by a single base64 character. + // If we first process the white space with the fast path, it will + // eat all of it. But, by the JavaScript standard, we should consume + // no character. See + // https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 + while (input_position > 0 && + base64_ignorable(input[input_position - 1], options)) { + input_position--; + } + } + outlen = output_position; + return {simdutf::error_code::SUCCESS, input_position}; +} + +} // namespace simdutf +#endif // SIMDUTF_BASE64_IMPLEMENTATION_H +/* end file include/simdutf/base64_implementation.h */ +namespace simdutf { + #if SIMDUTF_SPAN /** - * The active implementation. - * - * Automatically initialized on first use to the most advanced implementation supported by this hardware. 
- */ -extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation(); + * @brief span overload + * @return a tuple of result and outlen + */ +simdutf_really_inline + simdutf_constexpr23 simdutf_warn_unused std::tuple<result, size_t> + base64_to_binary_safe( + const detail::input_span_of_byte_like auto &input, + detail::output_span_of_byte_like auto &&binary_output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = loose, + bool decode_up_to_bad_char = false) noexcept { + size_t outlen = binary_output.size(); + #if SIMDUTF_CPLUSPLUS23 + if consteval { + using CInput = std::decay_t<decltype(*input.data())>; + static_assert(std::is_same_v<CInput, char>, + "sorry, the constexpr implementation is for now limited to " + "input of type char"); + using COutput = std::decay_t<decltype(*binary_output.data())>; + static_assert(std::is_same_v<COutput, char>, + "sorry, the constexpr implementation is for now limited to " + "output of type char"); + auto r = base64_to_binary_safe_impl( + input.data(), input.size(), binary_output.data(), outlen, options, + last_chunk_options, decode_up_to_bad_char); + return {r, outlen}; + } else + #endif + { + auto r = base64_to_binary_safe_impl( + reinterpret_cast<const char *>(input.data()), input.size(), + reinterpret_cast<char *>(binary_output.data()), outlen, options, + last_chunk_options, decode_up_to_bad_char); + return {r, outlen}; + } +} + #if SIMDUTF_SPAN +/** + * @brief span overload + * @return a tuple of result and outlen + */ +simdutf_really_inline + simdutf_warn_unused simdutf_constexpr23 std::tuple<result, size_t> + base64_to_binary_safe( + std::span<const char16_t> input, + detail::output_span_of_byte_like auto &&binary_output, + base64_options options = base64_default, + last_chunk_handling_options last_chunk_options = loose, + bool decode_up_to_bad_char = false) noexcept { + size_t outlen = binary_output.size(); + #if SIMDUTF_CPLUSPLUS23 + if consteval { + auto r = base64_to_binary_safe_impl( + input.data(), input.size(), binary_output.data(), outlen, options, + last_chunk_options, decode_up_to_bad_char); + return {r, outlen}; + } else + #endif + { + auto r = base64_to_binary_safe( + input.data(), input.size(), + reinterpret_cast<char *>(binary_output.data()), outlen, options, + last_chunk_options, decode_up_to_bad_char); + return {r, outlen}; + } +} + #endif // SIMDUTF_SPAN + #endif // SIMDUTF_SPAN } // namespace simdutf +#endif // SIMDUTF_FEATURE_BASE64 + #endif // SIMDUTF_IMPLEMENTATION_H /* end file include/simdutf/implementation.h */ - -// Implementation-internal files (must be included before the implementations themselves, to keep -// amalgamation working--otherwise, the first time a file is included, it might be put inside the -// #ifdef SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't -// compile unless that implementation is turned on). - +// Implementation-internal files (must be included before the implementations +// themselves, to keep amalgamation working--otherwise, the first time a file is +// included, it might be put inside the #ifdef +// SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other +// implementations can't compile unless that implementation is turned on). SIMDUTF_POP_DISABLE_WARNINGS diff --git a/simdutf/simdutf_c.h b/simdutf/simdutf_c.h new file mode 100644 index 00000000..abb8618f --- /dev/null +++ b/simdutf/simdutf_c.h @@ -0,0 +1,339 @@ +/*** + * simdutf_c.h - C API for simdutf + * This is currently experimental. + * We are committed to keeping the C API, but there might be mistakes in our + * implementation. Please report any issues you find.
+ */ + +#ifndef SIMDUTF_C_H +#define SIMDUTF_C_H + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#ifdef __has_include + #if __has_include(<uchar.h>) + #include <uchar.h> + #else // __has_include(<uchar.h>) + #define char16_t uint16_t + #define char32_t uint32_t + #endif // __has_include(<uchar.h>) +#else // __has_include() + #define char16_t uint16_t + #define char32_t uint32_t +#endif // __has_include + +#ifdef __cplusplus +extern "C" { +#endif + +/* C-friendly subset of simdutf errors */ +typedef enum simdutf_error_code { + SIMDUTF_ERROR_SUCCESS = 0, + SIMDUTF_ERROR_HEADER_BITS, + SIMDUTF_ERROR_TOO_SHORT, + SIMDUTF_ERROR_TOO_LONG, + SIMDUTF_ERROR_OVERLONG, + SIMDUTF_ERROR_TOO_LARGE, + SIMDUTF_ERROR_SURROGATE, + SIMDUTF_ERROR_INVALID_BASE64_CHARACTER, + SIMDUTF_ERROR_BASE64_INPUT_REMAINDER, + SIMDUTF_ERROR_BASE64_EXTRA_BITS, + SIMDUTF_ERROR_OUTPUT_BUFFER_TOO_SMALL, + SIMDUTF_ERROR_OTHER +} simdutf_error_code; + +typedef struct simdutf_result { + simdutf_error_code error; + size_t count; /* position of error or number of code units validated */ +} simdutf_result; + +typedef enum simdutf_encoding_type { + SIMDUTF_ENCODING_UNSPECIFIED = 0, + SIMDUTF_ENCODING_UTF8 = 1, + SIMDUTF_ENCODING_UTF16_LE = 2, + SIMDUTF_ENCODING_UTF16_BE = 4, + SIMDUTF_ENCODING_UTF32_LE = 8, + SIMDUTF_ENCODING_UTF32_BE = 16 +} simdutf_encoding_type; + +/* Validate UTF-8: returns true iff input is valid UTF-8 */ +bool simdutf_validate_utf8(const char *buf, size_t len); + +/* Validate UTF-8 with detailed result */ +simdutf_result simdutf_validate_utf8_with_errors(const char *buf, size_t len); + +/* Encoding detection */ +simdutf_encoding_type simdutf_autodetect_encoding(const char *input, + size_t length); +int simdutf_detect_encodings(const char *input, size_t length); + +/* ASCII validation */ +bool simdutf_validate_ascii(const char *buf, size_t len); +simdutf_result simdutf_validate_ascii_with_errors(const char *buf, size_t len); + +/* UTF-16 ASCII checks */ +bool simdutf_validate_utf16_as_ascii(const char16_t *buf, size_t len); +bool simdutf_validate_utf16be_as_ascii(const char16_t *buf, size_t len); +bool simdutf_validate_utf16le_as_ascii(const char16_t *buf, size_t len); + +/* UTF-16/UTF-8/UTF-32 validation (native/endian-specific) */ +bool simdutf_validate_utf16(const char16_t *buf, size_t len); +bool simdutf_validate_utf16le(const char16_t *buf, size_t len); +bool simdutf_validate_utf16be(const char16_t *buf, size_t len); +simdutf_result simdutf_validate_utf16_with_errors(const char16_t *buf, + size_t len); +simdutf_result simdutf_validate_utf16le_with_errors(const char16_t *buf, + size_t len); +simdutf_result simdutf_validate_utf16be_with_errors(const char16_t *buf, + size_t len); + +bool simdutf_validate_utf32(const char32_t *buf, size_t len); +simdutf_result simdutf_validate_utf32_with_errors(const char32_t *buf, + size_t len); + +/* to_well_formed UTF-16 helpers */ +void simdutf_to_well_formed_utf16le(const char16_t *input, size_t len, + char16_t *output); +void simdutf_to_well_formed_utf16be(const char16_t *input, size_t len, + char16_t *output); +void simdutf_to_well_formed_utf16(const char16_t *input, size_t len, + char16_t *output); + +/* Counting */ +size_t simdutf_count_utf16(const char16_t *input, size_t length); +size_t simdutf_count_utf16le(const char16_t *input, size_t length); +size_t simdutf_count_utf16be(const char16_t *input, size_t length); +size_t simdutf_count_utf8(const char *input, size_t length); + +/* Length estimators */ +size_t simdutf_utf8_length_from_latin1(const char *input, size_t length); +size_t
simdutf_latin1_length_from_utf8(const char *input, size_t length); +size_t simdutf_latin1_length_from_utf16(size_t length); +size_t simdutf_latin1_length_from_utf32(size_t length); +size_t simdutf_utf16_length_from_utf8(const char *input, size_t length); +size_t simdutf_utf32_length_from_utf8(const char *input, size_t length); +size_t simdutf_utf8_length_from_utf16(const char16_t *input, size_t length); +simdutf_result +simdutf_utf8_length_from_utf16_with_replacement(const char16_t *input, + size_t length); +size_t simdutf_utf8_length_from_utf16le(const char16_t *input, size_t length); +size_t simdutf_utf8_length_from_utf16be(const char16_t *input, size_t length); +simdutf_result +simdutf_utf8_length_from_utf16le_with_replacement(const char16_t *input, + size_t length); +simdutf_result +simdutf_utf8_length_from_utf16be_with_replacement(const char16_t *input, + size_t length); + +/* Conversions: latin1 <-> utf8, utf8 <-> utf16/utf32, utf16 <-> utf8, etc. */ +size_t simdutf_convert_latin1_to_utf8(const char *input, size_t length, + char *output); +size_t simdutf_convert_latin1_to_utf8_safe(const char *input, size_t length, + char *output, size_t utf8_len); +size_t simdutf_convert_latin1_to_utf16le(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_latin1_to_utf16be(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_latin1_to_utf32(const char *input, size_t length, + char32_t *output); + +size_t simdutf_convert_utf8_to_latin1(const char *input, size_t length, + char *output); +size_t simdutf_convert_utf8_to_utf16le(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_utf8_to_utf16be(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_utf8_to_utf16(const char *input, size_t length, + char16_t *output); + +size_t simdutf_convert_utf8_to_utf32(const char *input, size_t length, + char32_t *output); +simdutf_result simdutf_convert_utf8_to_latin1_with_errors(const char *input, + size_t length, + char *output); +simdutf_result simdutf_convert_utf8_to_utf16_with_errors(const char *input, + size_t length, + char16_t *output); +simdutf_result simdutf_convert_utf8_to_utf16le_with_errors(const char *input, + size_t length, + char16_t *output); +simdutf_result simdutf_convert_utf8_to_utf16be_with_errors(const char *input, + size_t length, + char16_t *output); +simdutf_result simdutf_convert_utf8_to_utf32_with_errors(const char *input, + size_t length, + char32_t *output); + +/* Conversions assuming valid input */ +size_t simdutf_convert_valid_utf8_to_latin1(const char *input, size_t length, + char *output); +size_t simdutf_convert_valid_utf8_to_utf16le(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_valid_utf8_to_utf16be(const char *input, size_t length, + char16_t *output); +size_t simdutf_convert_valid_utf8_to_utf32(const char *input, size_t length, + char32_t *output); + +/* UTF-16 -> UTF-8 and related conversions */ +size_t simdutf_convert_utf16_to_utf8(const char16_t *input, size_t length, + char *output); +size_t simdutf_convert_utf16le_to_utf8(const char16_t *input, size_t length, + char *output); +size_t simdutf_convert_utf16be_to_utf8(const char16_t *input, size_t length, + char *output); +size_t simdutf_convert_utf16_to_utf8_safe(const char16_t *input, size_t length, + char *output, size_t utf8_len); +size_t simdutf_convert_utf16_to_latin1(const char16_t *input, size_t length, + char *output); +size_t simdutf_convert_utf16le_to_latin1(const char16_t *input, 
size_t length, + char *output); +size_t simdutf_convert_utf16be_to_latin1(const char16_t *input, size_t length, + char *output); +simdutf_result +simdutf_convert_utf16_to_latin1_with_errors(const char16_t *input, + size_t length, char *output); +simdutf_result +simdutf_convert_utf16le_to_latin1_with_errors(const char16_t *input, + size_t length, char *output); +simdutf_result +simdutf_convert_utf16be_to_latin1_with_errors(const char16_t *input, + size_t length, char *output); + +simdutf_result simdutf_convert_utf16_to_utf8_with_errors(const char16_t *input, + size_t length, + char *output); +simdutf_result +simdutf_convert_utf16le_to_utf8_with_errors(const char16_t *input, + size_t length, char *output); +simdutf_result +simdutf_convert_utf16be_to_utf8_with_errors(const char16_t *input, + size_t length, char *output); + +size_t simdutf_convert_valid_utf16_to_utf8(const char16_t *input, size_t length, + char *output); +size_t simdutf_convert_valid_utf16_to_latin1(const char16_t *input, + size_t length, char *output); +size_t simdutf_convert_valid_utf16le_to_latin1(const char16_t *input, + size_t length, char *output); +size_t simdutf_convert_valid_utf16be_to_latin1(const char16_t *input, + size_t length, char *output); + +size_t simdutf_convert_valid_utf16le_to_utf8(const char16_t *input, + size_t length, char *output); +size_t simdutf_convert_valid_utf16be_to_utf8(const char16_t *input, + size_t length, char *output); + +/* UTF-16 <-> UTF-32 conversions */ +size_t simdutf_convert_utf16_to_utf32(const char16_t *input, size_t length, + char32_t *output); +size_t simdutf_convert_utf16le_to_utf32(const char16_t *input, size_t length, + char32_t *output); +size_t simdutf_convert_utf16be_to_utf32(const char16_t *input, size_t length, + char32_t *output); +simdutf_result simdutf_convert_utf16_to_utf32_with_errors(const char16_t *input, + size_t length, + char32_t *output); +simdutf_result +simdutf_convert_utf16le_to_utf32_with_errors(const char16_t *input, + size_t length, char32_t *output); +simdutf_result +simdutf_convert_utf16be_to_utf32_with_errors(const char16_t *input, + size_t length, char32_t *output); + +/* Valid UTF-16 conversions */ +size_t simdutf_convert_valid_utf16_to_utf32(const char16_t *input, + size_t length, char32_t *output); +size_t simdutf_convert_valid_utf16le_to_utf32(const char16_t *input, + size_t length, char32_t *output); +size_t simdutf_convert_valid_utf16be_to_utf32(const char16_t *input, + size_t length, char32_t *output); + +/* UTF-32 -> ... 
conversions */ +size_t simdutf_convert_utf32_to_utf8(const char32_t *input, size_t length, + char *output); +simdutf_result simdutf_convert_utf32_to_utf8_with_errors(const char32_t *input, + size_t length, + char *output); +size_t simdutf_convert_valid_utf32_to_utf8(const char32_t *input, size_t length, + char *output); + +size_t simdutf_convert_utf32_to_utf16(const char32_t *input, size_t length, + char16_t *output); +size_t simdutf_convert_utf32_to_utf16le(const char32_t *input, size_t length, + char16_t *output); +size_t simdutf_convert_utf32_to_utf16be(const char32_t *input, size_t length, + char16_t *output); +simdutf_result +simdutf_convert_utf32_to_latin1_with_errors(const char32_t *input, + size_t length, char *output); + +/* --- Find helpers --- */ +const char *simdutf_find(const char *start, const char *end, char character); +const char16_t *simdutf_find_utf16(const char16_t *start, const char16_t *end, + char16_t character); + +/* --- Base64 enums and helpers --- */ +typedef enum simdutf_base64_options { + SIMDUTF_BASE64_DEFAULT = 0, + SIMDUTF_BASE64_URL = 1, + SIMDUTF_BASE64_DEFAULT_NO_PADDING = 2, + SIMDUTF_BASE64_URL_WITH_PADDING = 3, + SIMDUTF_BASE64_DEFAULT_ACCEPT_GARBAGE = 4, + SIMDUTF_BASE64_URL_ACCEPT_GARBAGE = 5, + SIMDUTF_BASE64_DEFAULT_OR_URL = 8, + SIMDUTF_BASE64_DEFAULT_OR_URL_ACCEPT_GARBAGE = 12 +} simdutf_base64_options; + +typedef enum simdutf_last_chunk_handling_options { + SIMDUTF_LAST_CHUNK_LOOSE = 0, + SIMDUTF_LAST_CHUNK_STRICT = 1, + SIMDUTF_LAST_CHUNK_STOP_BEFORE_PARTIAL = 2, + SIMDUTF_LAST_CHUNK_ONLY_FULL_CHUNKS = 3 +} simdutf_last_chunk_handling_options; + +/* maximal binary length estimators */ +size_t simdutf_maximal_binary_length_from_base64(const char *input, + size_t length); +size_t simdutf_maximal_binary_length_from_base64_utf16(const char16_t *input, + size_t length); + +/* base64 decoding/encoding */ +simdutf_result simdutf_base64_to_binary( + const char *input, size_t length, char *output, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options); +simdutf_result simdutf_base64_to_binary_utf16( + const char16_t *input, size_t length, char *output, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options); + +size_t simdutf_base64_length_from_binary(size_t length, + simdutf_base64_options options); +size_t simdutf_base64_length_from_binary_with_lines( + size_t length, simdutf_base64_options options, size_t line_length); + +size_t simdutf_binary_to_base64(const char *input, size_t length, char *output, + simdutf_base64_options options); +size_t simdutf_binary_to_base64_with_lines(const char *input, size_t length, + char *output, size_t line_length, + simdutf_base64_options options); + +/* safe decoding that provides an in/out outlen parameter */ +simdutf_result simdutf_base64_to_binary_safe( + const char *input, size_t length, char *output, size_t *outlen, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options, + bool decode_up_to_bad_char); +simdutf_result simdutf_base64_to_binary_safe_utf16( + const char16_t *input, size_t length, char *output, size_t *outlen, + simdutf_base64_options options, + simdutf_last_chunk_handling_options last_chunk_options, + bool decode_up_to_bad_char); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* SIMDUTF_C_H */ diff --git a/text.cabal b/text.cabal index 552ce811..d9953277 100644 --- a/text.cabal +++ b/text.cabal @@ -66,6 +66,7 @@ extra-source-files: scripts/*.hs simdutf/LICENSE-APACHE 
simdutf/LICENSE-MIT + simdutf/simdutf_c.h simdutf/simdutf.h tests/literal-rule-test.sh tests/LiteralRuleTest.hs @@ -116,8 +117,8 @@ library if flag(simdutf) && !(arch(javascript) || flag(pure-haskell)) exposed-modules: Data.Text.Internal.Validate.Simd include-dirs: simdutf + c-sources: simdutf/hs_simdutf.c cxx-sources: simdutf/simdutf.cpp - cbits/validate_utf8.cpp cxx-options: -std=c++17 cpp-options: -DSIMDUTF if impl(ghc >= 9.4)
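
For reference, here is a minimal sketch of exercising the new C API declared in simdutf/simdutf_c.h from plain C. It assumes a hypothetical standalone test file compiled with the vendored header on the include path and linked against the compiled simdutf.cpp; the sample buffers are illustrative, not from the patch.

#include <stdio.h>
#include <string.h>
#include "simdutf_c.h"

int main(void) {
  const char valid[] = "h\xc3\xa9llo"; /* "héllo", well-formed UTF-8 */
  const char invalid[] = "h\x80llo";   /* lone continuation byte at index 1 */

  /* Boolean check: returns true iff the buffer is valid UTF-8. */
  printf("valid:   %d\n", simdutf_validate_utf8(valid, strlen(valid)));
  printf("invalid: %d\n", simdutf_validate_utf8(invalid, strlen(invalid)));

  /* Detailed check: reports the error kind and the byte position. */
  simdutf_result r = simdutf_validate_utf8_with_errors(invalid, strlen(invalid));
  if (r.error != SIMDUTF_ERROR_SUCCESS)
    printf("error %d at byte %zu\n", (int)r.error, r.count);
  return 0;
}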
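
Similarly, simdutf_base64_to_binary_safe wraps the two-pass base64_to_binary_safe_impl logic shown earlier (a bulk pass sized from the output capacity, then a scalar pass over the tail). A sketch of its in/out outlen contract, under the same build assumptions; the input string is illustrative:

#include <stdio.h>
#include <string.h>
#include "simdutf_c.h"

int main(void) {
  const char *b64 = "aGVsbG8gd29ybGQ="; /* base64 for "hello world" */
  size_t b64len = strlen(b64);

  /* Size the output buffer from the worst-case estimate for this input. */
  char out[64];
  if (simdutf_maximal_binary_length_from_base64(b64, b64len) > sizeof(out))
    return 1;

  /* outlen is in/out: buffer capacity on entry, bytes written on exit. */
  size_t outlen = sizeof(out);
  simdutf_result r = simdutf_base64_to_binary_safe(
      b64, b64len, out, &outlen, SIMDUTF_BASE64_DEFAULT,
      SIMDUTF_LAST_CHUNK_LOOSE, false /* decode_up_to_bad_char */);

  if (r.error == SIMDUTF_ERROR_SUCCESS)
    printf("decoded %zu bytes: %.*s\n", outlen, (int)outlen, out);
  else
    printf("error %d after %zu input units\n", (int)r.error, r.count);
  return 0;
}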