Skip to content

Commit 5eef8ab

Browse files
Merge pull request #60 from contour-terminal/feature/armv8-simd
Enable ARM64 SIMD in scan module
2 parents 00cb0e7 + bdd9a6d commit 5eef8ab

File tree

4 files changed

+260
-18
lines changed

4 files changed

+260
-18
lines changed

Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
- Fixes unicode-query's output for "character width".
44
- Fixes decoding invalid UTF-8 locking up.
55
- Fixes stage1 multistage-table sizes, reducing memory footprint a bit.
6+
- Adds SIMD implementation for scan API on ARM64 (NEON).
67
- unicode-query is now linked statically on UNIX platforms.
78

89
## 0.2.0 (2022-11-13)

src/unicode/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ set(public_headers
8787
convert.h
8888
emoji_segmenter.h
8989
grapheme_segmenter.h
90+
intrinsics.h
9091
run_segmenter.h
9192
scan.h
9293
script_segmenter.h
@@ -109,7 +110,6 @@ target_link_libraries(unicode_tablegen PRIVATE unicode::loader)
109110
# {{{ installation
110111
set(LIBUNICODE_CMAKE_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/libunicode" CACHE PATH "Installation directory for cmake files, a relative path that will be joined with ${CMAKE_INSTALL_PREFIX} or an absolute path.")
111112
set(LIBUNICODE_INSTALL_CMAKE_FILES ${MASTER_PROJECT} CACHE BOOL "Decides whether or not to install CMake config and -version files.")
112-
message(NOTICE "HELLO HERE: ${LIBUNICODE_CMAKE_DIR}")
113113

114114
set(INSTALL_TARGETS unicode_ucd unicode_loader unicode)
115115
set(TARGETS_EXPORT_NAME unicode-targets)

src/unicode/intrinsics.h

Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
/**
 * This file is part of the "libunicode" project
 * Copyright (c) 2023 Christian Parpart <christian@parpart.family>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <cstdint>

#if defined(__x86_64__) || defined(_M_AMD64)
#include <emmintrin.h> // SSE2
#include <immintrin.h> // AVX, AVX2, FMA
#endif

#if defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
#endif

namespace unicode
{

// Platform-neutral facade over one 128-bit SIMD register.
//
// Each supported architecture provides a specialization exposing the exact
// same static interface (modelled after the SSE2 intrinsics), and the
// `intrinsics` alias below selects the specialization for the current target.
template <typename>
struct platform_intrinsics;

#if defined(__GNUC__) && defined(__x86_64__)
// For some reason, GCC associates attributes with __m128i that are not obvious (alignment),
// and then complains about it when used below.
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

#if defined(__x86_64__) || defined(_M_AMD64) // {{{

template <>
struct platform_intrinsics<__m128i>
{
    using m128i = __m128i;

    // Returns a register with all 128 bits cleared.
    static inline m128i setzero() noexcept { return _mm_setzero_si128(); }

    // Broadcasts the byte `w` into all 16 lanes.
    static inline m128i set1_epi8(signed char w) noexcept { return _mm_set1_epi8(w); }

    // Packs four 32-bit values into one register; `a` ends up in the highest
    // lane and `d` in the lowest (same operand order as _mm_set_epi32).
    static inline m128i load32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept
    {
        return _mm_set_epi32(
            static_cast<int>(a), static_cast<int>(b), static_cast<int>(c), static_cast<int>(d));
    }

    // Computes the bitwise XOR of the 128-bit values in a and b.
    static inline m128i xor128(m128i a, m128i b) noexcept { return _mm_xor_si128(a, b); }

    // Computes the bitwise AND of the 128-bit values in a and b.
    static inline m128i and128(m128i a, m128i b) noexcept { return _mm_and_si128(a, b); }

    // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
    static inline m128i or128(m128i a, m128i b) noexcept { return _mm_or_si128(a, b); }

    // Loads 128 bits from a possibly unaligned address.
    static inline m128i load_unaligned(m128i const* p) noexcept { return _mm_loadu_si128(p); }

    // Copies the lowest 32-bit lane of a into a scalar.
    static inline int32_t to_i32(m128i a) noexcept { return _mm_cvtsi128_si32(a); }

    // Tests whether a and b are bitwise identical across all lanes.
    static inline bool compare(m128i a, m128i b) noexcept
    {
        return _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xFFFF;
    }

    // Per-lane signed 8-bit less-than; each result lane is 0xFF or 0x00.
    static inline m128i compare_less(m128i a, m128i b) noexcept { return _mm_cmplt_epi8(a, b); }

    // Collects the sign bit of each of the 16 byte lanes into a 16-bit mask.
    static inline int movemask_epi8(m128i a) noexcept { return _mm_movemask_epi8(a); }

    // Zero-extends the 64-bit scalar a into the low half of a register.
    static inline m128i cvtsi64_si128(int64_t a) noexcept { return _mm_cvtsi64_si128(a); }
};

using intrinsics = platform_intrinsics<__m128i>;

#endif
// }}}

#if defined(__aarch64__) || defined(_M_ARM64) // {{{

template <>
struct platform_intrinsics<int64x2_t>
{
    // The following inline functions (in their initial version) were borrowed from:
    // https://github.com/f1ed/emp/blob/master/emp-tool/utils/block.h

    using m128i = int64x2_t;

    // Returns a register with all 128 bits cleared.
    static inline m128i setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); }

    // Broadcasts the byte `w` into all 16 lanes.
    static inline m128i set1_epi8(signed char w) noexcept
    {
        return vreinterpretq_s64_s8(vdupq_n_s8(w));
    }

    // Packs four 32-bit values with the same operand order as _mm_set_epi32:
    // `a` becomes the highest lane and `d` the lowest, so load32 yields the
    // identical register layout on x86-64 and ARM64.
    static inline m128i load32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept
    {
        alignas(16) int32_t const data[4] = {
            static_cast<int>(d),
            static_cast<int>(c),
            static_cast<int>(b),
            static_cast<int>(a),
        };
        return vreinterpretq_s64_s32(vld1q_s32(data));
    }

    // Computes the bitwise XOR of the 128-bit values in a and b.
    static inline m128i xor128(m128i a, m128i b) noexcept
    {
        return vreinterpretq_s64_s32(veorq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
    }

    // Computes the bitwise AND of the 128-bit values in a and b.
    static inline m128i and128(m128i a, m128i b) noexcept
    {
        return vreinterpretq_s64_s32(vandq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
    }

    // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
    static inline m128i or128(m128i a, m128i b) noexcept
    {
        return vreinterpretq_s64_s32(vorrq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
    }

    // Loads 128 bits from a possibly unaligned address.
    static inline m128i load_unaligned(m128i const* p) noexcept
    {
        return vreinterpretq_s64_s32(vld1q_s32((int32_t const*) p));
    }

    // Copies the lowest 32-bit lane of a into a scalar.
    //
    // dst[31:0] := a[31:0]
    // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
    static inline int32_t to_i32(m128i a) noexcept
    {
        return vgetq_lane_s32(vreinterpretq_s32_s64(a), 0);
    }

    // Tests whether a and b are bitwise identical across all lanes.
    static inline bool compare(m128i a, m128i b) noexcept
    {
        return movemask_epi8(
                   vreinterpretq_s64_u32(vceqq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))))
               == 0xFFFF;
    }

    // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit
    // integers in b for less-than; each result lane is 0xFF or 0x00.
    static inline m128i compare_less(m128i a, m128i b) noexcept
    {
        return vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b)));
    }

    // Collects the sign bit of each of the 16 byte lanes into a 16-bit mask,
    // emulating x86 _mm_movemask_epi8 on NEON.
    static inline int movemask_epi8(m128i a) noexcept
    {
        // Strategy: isolate each byte's sign bit, then use increasingly wide
        // shift-and-accumulate steps (vsraq_n_*) to funnel neighboring lanes'
        // bits together until each 64-bit half carries its 8-bit mask in one
        // byte lane.
        uint8x16_t input = vreinterpretq_u8_s64(a);

        // One bit per byte lane: 0x01 where the sign bit was set, else 0x00.
        uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));

        // Merge byte pairs: each 16-bit lane now holds 2 mask bits.
        uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

        // Merge again: each 32-bit lane now holds 4 mask bits.
        uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

        // Final merge: each 64-bit lane holds its 8 mask bits in the low byte.
        uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

        // Combine the two 8-bit halves into the final 16-bit mask.
        return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
    }

    // Zero-extends the 64-bit scalar a into the low half of a register
    // (API parity with the x86 specialization's _mm_cvtsi64_si128).
    static inline m128i cvtsi64_si128(int64_t a) noexcept
    {
        return vsetq_lane_s64(a, vdupq_n_s64(0), 0);
    }
};

using intrinsics = platform_intrinsics<int64x2_t>;

#endif
// }}}

} // namespace unicode

src/unicode/scan.cpp

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* limitations under the License.
1313
*/
1414
#include <unicode/grapheme_segmenter.h>
15+
#include <unicode/intrinsics.h>
1516
#include <unicode/scan.h>
1617
#include <unicode/utf8.h>
1718
#include <unicode/width.h>
@@ -38,16 +39,14 @@ namespace unicode
3839

3940
namespace
4041
{
41-
#if defined(__SSE2__)
4242
// Returns the number of trailing zero bits in `value`.
//
// Precondition: value != 0. __builtin_ctz(0) is undefined behavior, and
// _tzcnt_u32(0) returns 32, so the two branches silently diverge at zero.
// The SIMD scan loop upholds this by only calling it after `check != 0`.
[[maybe_unused]] int countTrailingZeroBits(unsigned int value) noexcept
{
#if defined(_WIN32)
    return static_cast<int>(_tzcnt_u32(value));
#else
    return __builtin_ctz(value);
#endif
}
5150

5251
template <typename T>
5352
constexpr bool ascending(T low, T val, T high) noexcept
@@ -78,26 +77,24 @@ size_t detail::scan_for_text_ascii(string_view text, size_t maxColumnCount) noex
7877
auto input = text.data();
7978
auto const end = text.data() + min(text.size(), maxColumnCount);
8079

81-
#if defined(__SSE2__) // TODO: support __aarch64__
82-
__m128i const ControlCodeMax = _mm_set1_epi8(0x20); // 0..0x1F
83-
__m128i const Complex = _mm_set1_epi8(static_cast<char>(0x80));
80+
intrinsics::m128i const ControlCodeMax = intrinsics::set1_epi8(0x20); // 0..0x1F
81+
intrinsics::m128i const Complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000)
8482

85-
while (input < end - sizeof(__m128i))
83+
while (input < end - sizeof(intrinsics::m128i))
8684
{
87-
__m128i batch = _mm_loadu_si128((__m128i*) input);
88-
__m128i isControl = _mm_cmplt_epi8(batch, ControlCodeMax);
89-
__m128i isComplex = _mm_and_si128(batch, Complex);
90-
//__m128i isComplex = _mm_cmplt_epi8(batch, Complex);
91-
__m128i testPack = _mm_or_si128(isControl, isComplex);
92-
if (int const check = _mm_movemask_epi8(testPack); check != 0)
85+
intrinsics::m128i batch = intrinsics::load_unaligned((intrinsics::m128i*) input);
86+
intrinsics::m128i isControl = intrinsics::compare_less(batch, ControlCodeMax);
87+
intrinsics::m128i isComplex = intrinsics::and128(batch, Complex);
88+
// intrinsics::m128i isComplex = _mm_cmplt_epi8(batch, Complex);
89+
intrinsics::m128i testPack = intrinsics::or128(isControl, isComplex);
90+
if (int const check = intrinsics::movemask_epi8(testPack); check != 0)
9391
{
9492
int advance = countTrailingZeroBits(static_cast<unsigned>(check));
9593
input += advance;
9694
break;
9795
}
98-
input += sizeof(__m128i);
96+
input += sizeof(intrinsics::m128i);
9997
}
100-
#endif
10198

10299
while (input != end && is_ascii(*input))
103100
++input;

0 commit comments

Comments
 (0)