diff --git a/bazel/config/BUILD.bazel b/bazel/config/BUILD.bazel index 16289db41..9026a51e5 100644 --- a/bazel/config/BUILD.bazel +++ b/bazel/config/BUILD.bazel @@ -140,7 +140,7 @@ string_flag( ], ) -# PICO_BAZEL_CONFIG: PICO_DEFAULT_PRINTF_IMPL, The default implementation for pico_printf to link. compiler lets the compiler control printf behavior while pico provides a pico-specific implementation, type=string, default=double, group=build +# PICO_BAZEL_CONFIG: PICO_DEFAULT_PRINTF_IMPL, The default implementation for pico_printf to link. compiler lets the compiler control printf behavior while pico provides a pico-specific implementation, type=string, default=pico, group=build string_flag( name = "PICO_DEFAULT_PRINTF_IMPL", build_setting_default = "pico", @@ -151,6 +151,17 @@ string_flag( ], ) +# PICO_BAZEL_CONFIG: PICO_DEFAULT_TLS_IMPL, The default implementation for pico_thread_local to link; per_thread provides per thread locals while global provides shared global values and none provides no thread local support, type=string, default=per_thread, group=build +string_flag( + name = "PICO_DEFAULT_TLS_IMPL", + build_setting_default = "per_thread", + values = [ + "per_thread", + "global", + "none", + ], +) + # PICO_BAZEL_CONFIG: PICO_ASYNC_CONTEXT_IMPL, The default implementation for pico_async_context to link, type=string, default=threadsafe_background, group=build string_flag( name = "PICO_ASYNC_CONTEXT_IMPL", diff --git a/bazel/constraint/BUILD.bazel b/bazel/constraint/BUILD.bazel index 571bb9696..747147ed1 100644 --- a/bazel/constraint/BUILD.bazel +++ b/bazel/constraint/BUILD.bazel @@ -180,6 +180,17 @@ config_setting( flag_values = {"//bazel/config:PICO_DEFAULT_PRINTF_IMPL": "compiler"}, ) +config_setting( + name = "pico_thread_local_per_thread_enabled", + flag_values = {"//bazel/config:PICO_DEFAULT_TLS_IMPL": "per_thread"}, +) + +config_setting( + name = "pico_thread_local_global_enabled", + flag_values = {"//bazel/config:PICO_DEFAULT_TLS_IMPL": "global"}, +) + + config_setting( name = 
"pico_async_context_poll_enabled", flag_values = {"//bazel/config:PICO_ASYNC_CONTEXT_IMPL": "poll"}, diff --git a/docs/index.h b/docs/index.h index 572d94c70..ebf1d0aed 100644 --- a/docs/index.h +++ b/docs/index.h @@ -118,6 +118,7 @@ * \cond pico_stdio \defgroup pico_stdio pico_stdio \endcond * \cond pico_standard_binary_info \defgroup pico_standard_binary_info pico_standard_binary_info \endcond * \cond pico_standard_link \defgroup pico_standard_link pico_standard_link \endcond + * \cond pico_thread_local \defgroup pico_thread_local pico_thread_local \endcond * @} * * \defgroup misc External API Headers diff --git a/src/cmake/rp2_common.cmake b/src/cmake/rp2_common.cmake index 273839f70..51b7b2784 100644 --- a/src/cmake/rp2_common.cmake +++ b/src/cmake/rp2_common.cmake @@ -107,6 +107,7 @@ if (NOT PICO_BARE_METAL) pico_add_subdirectory(rp2_common/pico_malloc) pico_add_subdirectory(rp2_common/pico_printf) pico_add_subdirectory(rp2_common/pico_rand) + pico_add_subdirectory(rp2_common/pico_thread_local) if (PICO_COMBINED_DOCS OR NOT PICO_RP2040) pico_add_subdirectory(rp2_common/pico_sha256) diff --git a/src/common/pico_base_headers/BUILD.bazel b/src/common/pico_base_headers/BUILD.bazel index 4a859a3f2..088e78929 100644 --- a/src/common/pico_base_headers/BUILD.bazel +++ b/src/common/pico_base_headers/BUILD.bazel @@ -125,6 +125,7 @@ cc_library( "//src/rp2_common/pico_runtime:__pkg__", "//src/rp2_common/pico_runtime_init:__pkg__", "//src/rp2_common/pico_time_adapter:__pkg__", + "//src/rp2_common/pico_thread_local:__pkg__", "@picotool//:__subpackages__", ], deps = [ diff --git a/src/rp2040/pico_platform/memmap_blocked_ram.ld b/src/rp2040/pico_platform/memmap_blocked_ram.ld index 7dfa94c33..443017b59 100644 --- a/src/rp2040/pico_platform/memmap_blocked_ram.ld +++ b/src/rp2040/pico_platform/memmap_blocked_ram.ld @@ -33,7 +33,7 @@ INCLUDE "memmap_default.incl" * │ ├── section_ram_vector_table.incl rp2_common/pico_standard_link * │ ├── section_uninitialized_data.incl 
rp2_common/pico_standard_link * │ ├── section_default_data.incl rp2_common/pico_standard_link - * │ └── section_bss.incl rp2_common/pico_standard_link + * │ └── section_tls.incl rp2_common/pico_standard_link * ├── section_generated_post_data.incl rp2_common/pico_standard_link * ├── section_extra_post_data.incl rp2_common/pico_standard_link * ├── section_heap.incl rp2_common/pico_standard_link diff --git a/src/rp2040/pico_platform/memmap_copy_to_ram.ld b/src/rp2040/pico_platform/memmap_copy_to_ram.ld index 411b62d48..283047c0a 100644 --- a/src/rp2040/pico_platform/memmap_copy_to_ram.ld +++ b/src/rp2040/pico_platform/memmap_copy_to_ram.ld @@ -29,7 +29,7 @@ INCLUDE "memmap_copy_to_ram.incl" * ├── section_extra_post_text.incl rp2_common/pico_standard_link * ├── sections_copy_to_ram_data.incl rp2_common/pico_standard_link * │ ├── section_copy_to_ram_data.incl rp2_common/pico_standard_link - * │ └── section_bss.incl rp2_common/pico_standard_link + * │ └── section_tls.incl rp2_common/pico_standard_link * ├── section_generated_post_data.incl rp2_common/pico_standard_link * ├── section_extra_post_data.incl rp2_common/pico_standard_link * ├── section_heap.incl rp2_common/pico_standard_link diff --git a/src/rp2040/pico_platform/memmap_default.ld b/src/rp2040/pico_platform/memmap_default.ld index 3b6e09409..4958d5e8a 100644 --- a/src/rp2040/pico_platform/memmap_default.ld +++ b/src/rp2040/pico_platform/memmap_default.ld @@ -30,7 +30,7 @@ INCLUDE "memmap_default.incl" * │ ├── section_ram_vector_table.incl rp2_common/pico_standard_link * │ ├── section_uninitialized_data.incl rp2_common/pico_standard_link * │ ├── section_default_data.incl rp2_common/pico_standard_link - * │ └── section_bss.incl rp2_common/pico_standard_link + * │ └── section_tls.incl rp2_common/pico_standard_link * ├── section_generated_post_data.incl rp2_common/pico_standard_link * ├── section_extra_post_data.incl rp2_common/pico_standard_link * ├── section_heap.incl rp2_common/pico_standard_link diff --git 
a/src/rp2040/pico_platform/memmap_no_flash.ld b/src/rp2040/pico_platform/memmap_no_flash.ld index 00a434989..93ab9a7ff 100644 --- a/src/rp2040/pico_platform/memmap_no_flash.ld +++ b/src/rp2040/pico_platform/memmap_no_flash.ld @@ -24,7 +24,7 @@ INCLUDE "memmap_no_flash.incl" * ├── sections_no_flash_data.incl rp2_common/pico_standard_link * │ ├── section_no_flash_data.incl rp2_common/pico_standard_link * │ ├── section_uninitialized_data.incl rp2_common/pico_standard_link - * │ └── section_bss.incl rp2_common/pico_standard_link + * │ └── section_tls.incl rp2_common/pico_standard_link * ├── section_generated_post_data.incl rp2_common/pico_standard_link * ├── section_extra_post_data.incl rp2_common/pico_standard_link * ├── section_heap.incl rp2_common/pico_standard_link diff --git a/src/rp2350/pico_platform/memmap_copy_to_ram.ld b/src/rp2350/pico_platform/memmap_copy_to_ram.ld index 2a534b940..7e71fa9f3 100644 --- a/src/rp2350/pico_platform/memmap_copy_to_ram.ld +++ b/src/rp2350/pico_platform/memmap_copy_to_ram.ld @@ -29,7 +29,7 @@ INCLUDE "memmap_copy_to_ram.incl" * ├── section_extra_post_text.incl rp2_common/pico_standard_link * ├── sections_copy_to_ram_data.incl rp2_common/pico_standard_link * │ ├── section_copy_to_ram_data.incl rp2_common/pico_standard_link - * │ └── section_bss.incl rp2_common/pico_standard_link + * │ └── section_tls.incl rp2_common/pico_standard_link * ├── section_generated_post_data.incl rp2_common/pico_standard_link * ├── section_extra_post_data.incl rp2_common/pico_standard_link * ├── section_heap.incl rp2_common/pico_standard_link diff --git a/src/rp2350/pico_platform/memmap_default.ld b/src/rp2350/pico_platform/memmap_default.ld index 965541bd5..89452823d 100644 --- a/src/rp2350/pico_platform/memmap_default.ld +++ b/src/rp2350/pico_platform/memmap_default.ld @@ -30,7 +30,7 @@ INCLUDE "memmap_default.incl" * │ ├── section_ram_vector_table.incl rp2_common/pico_standard_link * │ ├── section_uninitialized_data.incl rp2_common/pico_standard_link * 
│ ├── section_default_data.incl rp2_common/pico_standard_link - * │ └── section_bss.incl rp2_common/pico_standard_link + * │ └── section_tls.incl rp2_common/pico_standard_link * ├── section_generated_post_data.incl rp2_common/pico_standard_link * ├── section_extra_post_data.incl rp2_common/pico_standard_link * ├── section_heap.incl rp2_common/pico_standard_link diff --git a/src/rp2350/pico_platform/memmap_no_flash.ld b/src/rp2350/pico_platform/memmap_no_flash.ld index 67acac37c..ca4cf8974 100644 --- a/src/rp2350/pico_platform/memmap_no_flash.ld +++ b/src/rp2350/pico_platform/memmap_no_flash.ld @@ -24,7 +24,7 @@ INCLUDE "memmap_no_flash.incl" * ├── sections_no_flash_data.incl rp2_common/pico_standard_link * │ ├── section_no_flash_data.incl rp2_common/pico_standard_link * │ ├── section_uninitialized_data.incl rp2_common/pico_standard_link - * │ └── section_bss.incl rp2_common/pico_standard_link + * │ └── section_tls.incl rp2_common/pico_standard_link * ├── section_generated_post_data.incl rp2_common/pico_standard_link * ├── section_extra_post_data.incl rp2_common/pico_standard_link * ├── section_heap.incl rp2_common/pico_standard_link diff --git a/src/rp2_common/BUILD.bazel b/src/rp2_common/BUILD.bazel index fe1c0c3e3..5b60c03e4 100644 --- a/src/rp2_common/BUILD.bazel +++ b/src/rp2_common/BUILD.bazel @@ -80,6 +80,7 @@ alias( "//src/rp2_common/pico_mem_ops:__pkg__", "//src/rp2_common/pico_printf:__pkg__", "//src/rp2_common/pico_runtime_init:__pkg__", + "//src/rp2_common/pico_thread_local:__pkg__", ], ) diff --git a/src/rp2_common/pico_clib_interface/newlib_interface.c b/src/rp2_common/pico_clib_interface/newlib_interface.c index 61adeb480..9e4b058e7 100644 --- a/src/rp2_common/pico_clib_interface/newlib_interface.c +++ b/src/rp2_common/pico_clib_interface/newlib_interface.c @@ -157,6 +157,7 @@ int __attribute__((weak)) _getentropy (__unused void *buffer, __unused size_t le // want to pull in pico_rand. 
the user can supply their own strong implementation if they need it! return -1; } + // exit is not useful... no desire to pull in __call_exitprocs void exit(int status) { _exit(status); @@ -201,4 +202,4 @@ void runtime_init(void) { for (void (**p)(void) = &__init_array_start; p < &__init_array_end; ++p) { (*p)(); } -} \ No newline at end of file +} diff --git a/src/rp2_common/pico_clib_interface/picolibc_interface.c b/src/rp2_common/pico_clib_interface/picolibc_interface.c index a0e1e08f5..afc6434ff 100644 --- a/src/rp2_common/pico_clib_interface/picolibc_interface.c +++ b/src/rp2_common/pico_clib_interface/picolibc_interface.c @@ -134,27 +134,3 @@ void runtime_init(void) { extern void __libc_init_array(void); __libc_init_array(); } - -#if !PICO_RUNTIME_NO_INIT_PER_CORE_TLS_SETUP -__weak void runtime_init_pre_core_tls_setup(void) { - // for now we just set the same global area on both cores - // note: that this is superfluous with the stock picolibc it seems, since it is itself - // using a version of __aeabi_read_tp that returns the same pointer on both cores - extern char __tls_base[]; - extern void _set_tls(void *tls); - _set_tls(__tls_base); -} -#endif - -#if !PICO_RUNTIME_SKIP_INIT_PER_CORE_TLS_SETUP -PICO_RUNTIME_INIT_FUNC_PER_CORE(runtime_init_pre_core_tls_setup, PICO_RUNTIME_INIT_PER_CORE_TLS_SETUP); -#endif - -//// naked as it must preserve everything except r0 and lr -//uint32_t __attribute__((naked)) WRAPPER_FUNC(__aeabi_read_tp)() { -// // note for now we are just returning a shared instance on both cores -// pico_default_asm_volatile( -// "ldr r0, =__tls_base\n" -// "bx lr\n" -// ); -//} \ No newline at end of file diff --git a/src/rp2_common/pico_runtime/CMakeLists.txt b/src/rp2_common/pico_runtime/CMakeLists.txt index 793a1661b..d9a7aa1d1 100644 --- a/src/rp2_common/pico_runtime/CMakeLists.txt +++ b/src/rp2_common/pico_runtime/CMakeLists.txt @@ -36,6 +36,7 @@ set(PICO_RUNTIME_LIBRARIES pico_crt0 pico_clib_interface pico_stdio + pico_thread_local ) 
foreach(LIB IN LISTS PICO_RUNTIME_LIBRARIES) @@ -83,7 +84,8 @@ function(pico_minimize_runtime TARGET) DOUBLE FPGA_CHECK PANIC - AUTO_INIT_MUTEX) + AUTO_INIT_MUTEX + TLS) cmake_parse_arguments(RUNTIME "" "" "INCLUDE;EXCLUDE" ${ARGN} ) foreach (INCL_EXCL IN ITEMS INCLUDE EXCLUDE) @@ -114,6 +116,9 @@ function(pico_minimize_runtime TARGET) if (NOT RUNTIME_INCLUDE_AUTO_INIT_MUTEX) target_compile_definitions(${TARGET} PRIVATE PICO_RUNTIME_SKIP_INIT_MUTEX=1) endif() + if (NOT RUNTIME_INCLUDE_TLS) + pico_set_tls_implementation(${TARGET} none) + endif() if (RUNTIME_INCLUDE_PRINTF) if (NOT RUNTIME_INCLUDE_PRINTF_MINIMAL) diff --git a/src/rp2_common/pico_standard_link/script_include/section_bss.incl b/src/rp2_common/pico_standard_link/script_include/section_bss.incl deleted file mode 100644 index 38de94018..000000000 --- a/src/rp2_common/pico_standard_link/script_include/section_bss.incl +++ /dev/null @@ -1,31 +0,0 @@ -/* Defines the following symbols for use by code: - __bss_start__, __bss_end__ - __tls_base, __tls_end - __tbss_end - __global_pointer$ -*/ - -SECTIONS -{ - .tbss (NOLOAD) : { - . = ALIGN(4); - __bss_start__ = .; - __tls_base = .; - *(.tbss .tbss.* .gnu.linkonce.tb.*) - *(.tcommon) - - __tls_end = .; - } > RAM - - .bss (NOLOAD) : { - . = ALIGN(4); - __tbss_end = .; - - *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*))) - *(COMMON) - PROVIDE(__global_pointer$ = . + 2K); - *(.sbss*) - . 
= ALIGN(4); - __bss_end__ = .; - } > RAM -} diff --git a/src/rp2_common/pico_standard_link/script_include/section_copy_to_ram_data.incl b/src/rp2_common/pico_standard_link/script_include/section_copy_to_ram_data.incl index 48b2914ae..646523756 100644 --- a/src/rp2_common/pico_standard_link/script_include/section_copy_to_ram_data.incl +++ b/src/rp2_common/pico_standard_link/script_include/section_copy_to_ram_data.incl @@ -7,6 +7,10 @@ __fini_array_start, __fini_array_end __tdata_end __etext + __bss_start__, __bss_end__ + __tls_base, __tls_end + __tbss_end + __global_pointer$ */ SECTIONS @@ -25,6 +29,11 @@ SECTIONS *(.data*) *(.sdata*) + . = ALIGN(4); + PROVIDE_HIDDEN (__emutls_array_start = .); + *(.*.__emutls_v.*) + PROVIDE_HIDDEN (__emutls_array_end = .); + . = ALIGN(4); *(.after_data.*) . = ALIGN(4); @@ -71,4 +80,33 @@ SECTIONS /* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */ __etext = LOADADDR(.data); + + . = ALIGN(4); + /* note that .tbss is expected to directly follow .tdata */ + .tbss (NOLOAD) : { + __bss_start__ = .; + __tls_base = .; + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + + . = ALIGN(4); + __tls_end = .; + } > RAM + + /* workaround RISC-V GNU linker bug with spacer section (otherwise it places .bss over .tbss) */ + .tbss_space (NOLOAD) : { + . = __bss_start__ + SIZEOF(.tbss); + } > RAM + + .bss (NOLOAD) : { + . = ALIGN(4); + __tbss_end = .; + + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*))) + *(COMMON) + PROVIDE(__global_pointer$ = . + 2K); + *(.sbss*) + . 
= ALIGN(4); + __bss_end__ = .; + } > RAM } diff --git a/src/rp2_common/pico_standard_link/script_include/section_default_data.incl b/src/rp2_common/pico_standard_link/script_include/section_default_data.incl index 114b3c493..c9e4061a9 100644 --- a/src/rp2_common/pico_standard_link/script_include/section_default_data.incl +++ b/src/rp2_common/pico_standard_link/script_include/section_default_data.incl @@ -2,6 +2,10 @@ __data_start__, __data_end__ __mutex_array_start, __mutex_array_end __etext + __bss_start__, __bss_end__ + __tls_base, __tls_end + __tbss_end + __global_pointer$ */ SECTIONS @@ -18,9 +22,15 @@ SECTIONS *(.rodata*) . = ALIGN(4); + PROVIDE_HIDDEN (__emutls_array_start = .); + *(.*.__emutls_v.*) + PROVIDE_HIDDEN (__emutls_array_end = .); + . = ALIGN(4); + *(.data*) *(.sdata*) + . = ALIGN(4); *(.after_data.*) . = ALIGN(4); @@ -44,4 +54,34 @@ SECTIONS /* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */ __etext = LOADADDR(.data); + + . = ALIGN(4); + /* note that .tbss is expected to directly follow .tdata */ + .tbss (NOLOAD) : { + __bss_start__ = .; + __tls_base = .; + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + + . = ALIGN(4); + __tls_end = .; + } > RAM + + /* workaround RISC-V GNU linker bug with spacer section (otherwise it places .bss over .tbss) */ + .tbss_space (NOLOAD) : { + . = __bss_start__ + SIZEOF(.tbss); + } > RAM + + .bss (NOLOAD) : { + . = ALIGN(4); + __tbss_end = .; + + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*))) + *(COMMON) + PROVIDE(__global_pointer$ = . + 2K); + *(.sbss*) + . 
= ALIGN(4); + __bss_end__ = .; + } > RAM + } diff --git a/src/rp2_common/pico_standard_link/script_include/section_no_flash_data.incl b/src/rp2_common/pico_standard_link/script_include/section_no_flash_data.incl index 8a1bcd5ec..d162a45b1 100644 --- a/src/rp2_common/pico_standard_link/script_include/section_no_flash_data.incl +++ b/src/rp2_common/pico_standard_link/script_include/section_no_flash_data.incl @@ -7,6 +7,10 @@ __fini_array_start, __fini_array_end __tdata_end __etext + __bss_start__, __bss_end__ + __tls_base, __tls_end + __tbss_end + __global_pointer$ */ SECTIONS @@ -20,6 +24,11 @@ SECTIONS *(.data*) *(.sdata*) + . = ALIGN(4); + PROVIDE_HIDDEN (__emutls_array_start = .); + *(.*.__emutls_v.*) + PROVIDE_HIDDEN (__emutls_array_end = .); + . = ALIGN(4); *(.after_data.*) . = ALIGN(4); @@ -66,4 +75,33 @@ SECTIONS /* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */ __etext = LOADADDR(.data); + + . = ALIGN(4); + /* note that .tbss is expected to directly follow .tdata */ + .tbss (NOLOAD) : { + __bss_start__ = .; + __tls_base = .; + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + + . = ALIGN(4); + __tls_end = .; + } > RAM + + /* workaround RISC-V GNU linker bug with spacer section (otherwise it places .bss over .tbss) */ + .tbss_space (NOLOAD) : { + . = __bss_start__ + SIZEOF(.tbss); + } > RAM + + .bss (NOLOAD) : { + . = ALIGN(4); + __tbss_end = .; + + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*))) + *(COMMON) + PROVIDE(__global_pointer$ = . + 2K); + *(.sbss*) + . 
= ALIGN(4); + __bss_end__ = .; + } > RAM } diff --git a/src/rp2_common/pico_standard_link/script_include/section_tls.incl b/src/rp2_common/pico_standard_link/script_include/section_tls.incl new file mode 100644 index 000000000..0ea4a635d --- /dev/null +++ b/src/rp2_common/pico_standard_link/script_include/section_tls.incl @@ -0,0 +1,63 @@ +/* Sections for proper TLS + + Defines the following symbols for use by code: + __tls_start, __tls_size, __tls_align, __tls_size_align + __arm32_tls_tcb_offset + __tdata_size, __tdata_source, __tbss_size, __tbss_offset +*/ + +SECTIONS +{ + PROVIDE( __tls_start = SIZEOF(.tdata) ? ADDR(.tdata) : ADDR(.tbss) ); + PROVIDE( __tls_size = __tls_end - __tls_start ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) ); + PROVIDE( __tls_size_align = ALIGN( __tls_size, __tls_align) ); + PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) ); + PROVIDE( __tdata_size = SIZEOF(.tdata) ); + PROVIDE( __tdata_source = LOADADDR(.tdata) ); + PROVIDE( __tbss_size = SIZEOF(.tbss) ); + PROVIDE( __tbss_offset = ADDR(.tbss) - __tls_start ); + + /* this marker is present if the code knows we don't need __tls0 and __tls1... we decide + * to waste 4 bytes in that case rather than possibly large amounts for __tls0 and __tls1 + * which will never be used */ + .tlsX_not_needed_marker (NOLOAD) : { + KEEP(*(.tlsX_not_needed_marker)) + } + PROVIDE( __tlsX_align = SIZEOF(.tlsX_not_needed_marker) > 0 ? 4 : __tls_align); + PROVIDE( __tlsX_size_align = SIZEOF(.tlsX_not_needed_marker) > 0 ? 0 : __tls_size_align); + + /* + * Workaround older toolchain bugs by doing the PROVIDE(x = y) followed by x = DEFINED(x) ? x : y + * + * Note there is probably a less verbose fix in this case as many of these symbols are + * potentially not needed on older GCC for instance which is certainly using emutls + * but this fix style is known to be correct, so is likely clearer + */ + __tls_start = DEFINED( __tls_start) ? __tls_start : SIZEOF(.tdata) ? 
ADDR(.tdata) : ADDR(.tbss); + __tls_size = DEFINED( __tls_size) ? __tls_size : __tls_end - __tls_start; + __tls_align = DEFINED( __tls_align) ? __tls_align : MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)); + __tls_size_align = DEFINED( __tls_size_align) ? __tls_size_align : ALIGN( __tls_size, __tls_align); + __arm32_tls_tcb_offset = DEFINED( __arm32_tls_tcb_offset) ? __arm32_tls_tcb_offset : MAX(8, __tls_align); + __tdata_size = DEFINED( __tdata_size) ? __tdata_size : SIZEOF(.tdata); + __tdata_source = DEFINED( __tdata_source) ? __tdata_source : LOADADDR(.tdata); + __tbss_size = DEFINED( __tbss_size) ? __tbss_size : SIZEOF(.tbss); + __tbss_offset = DEFINED( __tbss_offset) ? __tbss_offset : ADDR(.tbss) - __tls_start; + + __tlsX_align = DEFINED(__tlsX_align) ? __tlsX_align : (SIZEOF(.tlsX_not_needed_marker) > 0 ? 4 : __tls_align); + __tlsX_size_align = DEFINED(__tlsX_size_align) ? __tlsX_size_align : (SIZEOF(.tlsX_not_needed_marker) > 0 ? 0 : __tls_size_align); + + /* Thread local storage static allocations, one per core */ + .tls0 (NOLOAD) : { + . = ALIGN(__tlsX_align); + PROVIDE(__tls0_base = .); + . = . + __tlsX_size_align; + } > RAM + + .tls1 (NOLOAD) : { + . = ALIGN(__tlsX_align); + PROVIDE(__tls1_base = .); + . = . 
+ __tlsX_size_align; + } > RAM + +} diff --git a/src/rp2_common/pico_standard_link/script_include/sections_copy_to_ram_data.incl b/src/rp2_common/pico_standard_link/script_include/sections_copy_to_ram_data.incl index ff90005e1..9a9803bbb 100644 --- a/src/rp2_common/pico_standard_link/script_include/sections_copy_to_ram_data.incl +++ b/src/rp2_common/pico_standard_link/script_include/sections_copy_to_ram_data.incl @@ -1,4 +1,4 @@ /* Sub-section list for read/write sections in copy_to_ram binaries */ INCLUDE "section_copy_to_ram_data.incl" -INCLUDE "section_bss.incl" +INCLUDE "section_tls.incl" diff --git a/src/rp2_common/pico_standard_link/script_include/sections_default_data.incl b/src/rp2_common/pico_standard_link/script_include/sections_default_data.incl index cfa77cb2f..ac1f79c8c 100644 --- a/src/rp2_common/pico_standard_link/script_include/sections_default_data.incl +++ b/src/rp2_common/pico_standard_link/script_include/sections_default_data.incl @@ -3,4 +3,4 @@ INCLUDE "section_ram_vector_table.incl" INCLUDE "section_uninitialized_data.incl" INCLUDE "section_default_data.incl" -INCLUDE "section_bss.incl" +INCLUDE "section_tls.incl" diff --git a/src/rp2_common/pico_standard_link/script_include/sections_no_flash_data.incl b/src/rp2_common/pico_standard_link/script_include/sections_no_flash_data.incl index 6a55a1481..e1aa85826 100644 --- a/src/rp2_common/pico_standard_link/script_include/sections_no_flash_data.incl +++ b/src/rp2_common/pico_standard_link/script_include/sections_no_flash_data.incl @@ -2,4 +2,4 @@ INCLUDE "section_no_flash_data.incl" INCLUDE "section_uninitialized_data.incl" -INCLUDE "section_bss.incl" +INCLUDE "section_tls.incl" diff --git a/src/rp2_common/pico_thread_local/BUILD.bazel b/src/rp2_common/pico_thread_local/BUILD.bazel new file mode 100644 index 000000000..d8438bbee --- /dev/null +++ b/src/rp2_common/pico_thread_local/BUILD.bazel @@ -0,0 +1,55 @@ +load("@rules_cc//cc:cc_library.bzl", "cc_library") +load("//bazel:defs.bzl", 
"compatible_with_rp2") + +package(default_visibility = ["//visibility:public"]) + +alias( + name = "pico_thread_local", + actual = select({ + "//bazel/constraint:pico_thread_local_per_thread_enabled": ":pico_thread_local_per_thread", + "//bazel/constraint:pico_thread_local_global_enabled": ":pico_thread_local_global", + "//conditions:default": ":pico_thread_local_none", + }), +) + +cc_library( + name = "pico_thread_local_per_thread", + srcs = ["thread_local.c"], + hdrs = ["include/pico/thread_local.h", "include/picotls.h"], + defines = ["PICO_THREAD_LOCAL_MODE_PER_THREAD=1"], + includes = ["include"], + target_compatible_with = compatible_with_rp2(), + deps = [ + "//src/common/pico_base_headers", + "//src/rp2_common:pico_platform_internal", + "//src/rp2_common/pico_runtime_init:pico_runtime_init_link", + ], +) + +cc_library( + name = "pico_thread_local_global", + srcs = ["thread_local.c"], + hdrs = ["include/pico/thread_local.h", "include/picotls.h"], + defines = ["PICO_THREAD_LOCAL_MODE_GLOBAL=1"], + includes = ["include"], + target_compatible_with = compatible_with_rp2(), + deps = [ + "//src/common/pico_base_headers", + "//src/rp2_common:pico_platform_internal", + "//src/rp2_common/pico_runtime_init:pico_runtime_init_link", + ], +) + +cc_library( + name = "pico_thread_local_none", + srcs = ["thread_local.c"], + hdrs = ["include/pico/thread_local.h", "include/picotls.h"], + defines = ["PICO_THREAD_LOCAL_MODE_NONE=1"], + includes = ["include"], + target_compatible_with = compatible_with_rp2(), + deps = [ + "//src/common/pico_base_headers", + "//src/rp2_common:pico_platform_internal", + "//src/rp2_common/pico_runtime_init:pico_runtime_init_link", + ], +) diff --git a/src/rp2_common/pico_thread_local/CMakeLists.txt b/src/rp2_common/pico_thread_local/CMakeLists.txt new file mode 100644 index 000000000..220b74a52 --- /dev/null +++ b/src/rp2_common/pico_thread_local/CMakeLists.txt @@ -0,0 +1,92 @@ + +pico_add_library(pico_thread_local_none NOFLAG) 
+target_sources(pico_thread_local_none INTERFACE ${CMAKE_CURRENT_LIST_DIR}/thread_local.c) +target_include_directories(pico_thread_local_none SYSTEM INTERFACE ${CMAKE_CURRENT_LIST_DIR}/include) +target_compile_definitions(pico_thread_local_none INTERFACE PICO_THREAD_LOCAL_MODE_NONE=1) + +pico_add_library(pico_thread_local_global NOFLAG) +target_sources(pico_thread_local_global INTERFACE ${CMAKE_CURRENT_LIST_DIR}/thread_local.c) +target_include_directories(pico_thread_local_global SYSTEM INTERFACE ${CMAKE_CURRENT_LIST_DIR}/include) +target_compile_definitions(pico_thread_local_global INTERFACE PICO_THREAD_LOCAL_MODE_GLOBAL=1) + +pico_add_library(pico_thread_local_per_thread NOFLAG) +target_sources(pico_thread_local_per_thread INTERFACE ${CMAKE_CURRENT_LIST_DIR}/thread_local.c) +target_include_directories(pico_thread_local_per_thread SYSTEM INTERFACE ${CMAKE_CURRENT_LIST_DIR}/include) +target_compile_definitions(pico_thread_local_per_thread INTERFACE PICO_THREAD_LOCAL_MODE_PER_THREAD=1) + +set(PICO_DEFAULT_TLS_IMPL pico_thread_local_per_thread) + +pico_add_library(pico_thread_local NOFLAG) +target_link_libraries(pico_thread_local INTERFACE + $>,$,${PICO_DEFAULT_TLS_IMPL}>) + +macro(pico_set_tls_implementation TARGET IMPL) + get_target_property(target_type ${TARGET} TYPE) + if ("EXECUTABLE" STREQUAL "${target_type}") + set_target_properties(${TARGET} PROPERTIES PICO_TARGET_TLS_IMPL "pico_thread_local_${IMPL}") + else() + message(FATAL_ERROR "tls implementation must be set on executable not library") + endif() +endmacro() + +# Everything below is an optimization that uses a test compile of __thread variable into a static library +# and looks at what symbols it pulls in, which allows us to identify the actual TLS type on Arm +function(pico_detect_tls_type OUTPUT_VAR) + include(CheckCSourceCompiles) + + # 1. 
Try to compile a snippet using TLS + set(TLS_CHECK_SOURCE " + __thread int test_var = 42; + int tls_check() { + return test_var; + } + ") + + # Create a temporary file for the source + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/tls_check.c" "${TLS_CHECK_SOURCE}") + + set(TLS_CHECK_LIBRARY "${CMAKE_CURRENT_BINARY_DIR}/tls_check.a") + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) + # Attempt to compile the source file + try_compile(CAN_COMPILE_TLS + "${CMAKE_CURRENT_BINARY_DIR}" # Directory for temporary build files + "${CMAKE_CURRENT_BINARY_DIR}/tls_check.c" # Source file to compile + COPY_FILE ${TLS_CHECK_LIBRARY} + ) + + if (CAN_COMPILE_TLS) + execute_process( + COMMAND ${CMAKE_NM} ${TLS_CHECK_LIBRARY} + OUTPUT_VARIABLE NM_OUT + ) + if(NM_OUT MATCHES "__emutls_get_address") + message(STATUS "Detected emulated TLS (__emutls_get_address)") + set(TLS_TYPE "emutls") + elseif(NM_OUT MATCHES "__aeabi_read_tp") + message(STATUS "Detected native TLS (__aeabi_read_tp)") + set(TLS_TYPE "arm-aeabi") + else() + if (NOT PICO_RISCV) + message(WARNING "Could not identify TLS mechanism") + endif() + set(TLS_TYPE "unknown") + endif() + else() + message(STATUS "Compiler does not support __thread keyword.") + set(TLS_TYPE "unsupported") + endif() + set(${OUTPUT_VAR} ${TLS_TYPE} PARENT_SCOPE) + file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/tls_check.c ${CMAKE_CURRENT_BINARY_DIR}/tls_check.a) +endfunction() + +if (NOT PICO_THREAD_LOCAL_TYPE) + pico_detect_tls_type(_TLS_TYPE) + set(PICO_THREAD_LOCAL_TYPE "${_TLS_TYPE}" CACHE INTERNAL "") + unset(_TLS_TYPE) +endif() + +if (PICO_THREAD_LOCAL_TYPE STREQUAL "emutls") + target_compile_definitions(pico_thread_local INTERFACE PICO_THREAD_LOCAL_EMUTLS_CONFIRMED=1) +elseif(PICO_THREAD_LOCAL_TYPE STREQUAL "arm-aeabi") + target_compile_definitions(pico_thread_local INTERFACE PICO_THREAD_LOCAL_THREAD_POINTER_ARM_AEABI_CONFIRMED=1) +endif() \ No newline at end of file diff --git a/src/rp2_common/pico_thread_local/include/pico/thread_local.h 
b/src/rp2_common/pico_thread_local/include/pico/thread_local.h new file mode 100644 index 000000000..47bffab8e --- /dev/null +++ b/src/rp2_common/pico_thread_local/include/pico/thread_local.h @@ -0,0 +1,43 @@ +/* +* Copyright (c) 2026 Raspberry Pi (Trading) Ltd. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#ifndef _PICO_THREAD_LOCAL_H +#define _PICO_THREAD_LOCAL_H + +#include + +// PICO_CONFIG: PICO_THREAD_LOCAL_CORE1_REINITIALIZE, Whether re-initializing core 1 should reset its thread local variables. A tiny space-saving can be achieved by disabling this support if not needed, type=bool, default=1, advanced=true, group=pico_thread_local +// PICO_CONFIG: PICO_THREAD_LOCAL_PROVIDE_INIT_TLS, Whether pico_thread_local should provide its own implementation of _init_tls, type=bool, default=1 for non-picolibc and 0 for picolibc, advanced=true, group=pico_thread_local +// PICO_CONFIG: PICO_THREAD_LOCAL_PROVIDE_SET_TLS, Whether pico_thread_local should provide its own implementation of _set_tls, type=bool, default=1, advanced=true, group=pico_thread_local + +#if !defined(PICO_THREAD_LOCAL_CORE1_REINITIALIZE) && LIB_PICO_MULTICORE +#define PICO_THREAD_LOCAL_CORE1_REINITIALIZE 1 +#endif + +#if !PICO_THREAD_LOCAL_MODE_NONE +#ifndef PICO_THREAD_LOCAL_PROVIDE_INIT_TLS +#ifndef PICOLIBC_TLS // this provides the function, and frankly in that case we don't expect emutls - is that fair tho? 
+#define PICO_THREAD_LOCAL_PROVIDE_INIT_TLS 1 +#endif +#endif + +#ifndef PICO_THREAD_LOCAL_PROVIDE_SET_TLS +#define PICO_THREAD_LOCAL_PROVIDE_SET_TLS 1 +#endif +#endif + +#if PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER +// auto-select THREAD_POINTER type if not provided +#if !PICO_THREAD_LOCAL_THREAD_POINTER_VIA_ARM_EABI && !PICO_THREAD_LOCAL_THREAD_POINTER_VIA_RISCV_REG +#if __riscv || PICO_THREAD_LOCAL_THREAD_POINTER_RISCV_REG_CONFIRMED +#define PICO_THREAD_LOCAL_THREAD_POINTER_VIA_RISCV_REG 1 +#else +#define PICO_THREAD_LOCAL_THREAD_POINTER_VIA_ARM_EABI 1 +#endif +#endif +#endif + +#endif \ No newline at end of file diff --git a/src/rp2_common/pico_thread_local/include/picotls.h b/src/rp2_common/pico_thread_local/include/picotls.h new file mode 100644 index 000000000..ffe7c3620 --- /dev/null +++ b/src/rp2_common/pico_thread_local/include/picotls.h @@ -0,0 +1,122 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2019 Keith Packard + * Copyright (c) 2026 Raspberry Pi (Trading) Ltd. + */ +#ifndef _PICO_THREAD_LOCAL_PICOTLS_H +#define _PICO_THREAD_LOCAL_PICOTLS_H + +// Note the name of this file is slightly confusing - this is actually overriding from picolibc (if present)! + +/** \file picotls.h + * \defgroup pico_thread_local pico_thread_local + * + * \brief C/C++ __thread/thread_local support + * + * This library provides transparent runtime support for thread locals for the different types of thread local implementations used by the SDK supported compilers and C libraries. + * + * This means that by default, each core will get its own value for the thread local variable, and the core 1 value will be re-initialized if the core is restarted. 
+ * + * Additionally, this library provides the "picolibc" style methods `_tls_size()`, `_init_tls()` and `_set_tls()` even when not using picolibc, which makes + * it easy to enable thread locals per task on at least FreeRTOS (just set `#define configUSE_PICOLIBC_TLS 1` in your `FreeRTOSConfig.h`).
+ * + * The pico_thread_local library comes in three main flavors: + * + * 1. `pico_thread_local_per_thread` (default) - Full support for thread local variables. Each thread/core gets its own copy of the variable initialized when it starts executing. The library tries to minimize the overhead if no thread locals are used; however, particularly on RISC-V, there is always a small overhead. + * 2. `pico_thread_local_global` - Thread local variables are allowed in code, but each thread/core shares the same value (i.e. they aren't really thread local). There is very minimal overhead for this option. + * 3. `pico_thread_local_none` - No support for thread locals is provided. Code using them may not compile or link, and if it does the values won't be shared and may not even be initialized correctly. This mode is however guaranteed to have basically no overhead when thread locals are known not to be used. + * + * The TLS implementation provided by the `pico_thread_local` library may be selected from CMake via `pico_set_tls_implementation(TARGET per_thread|global|none)` and the default for all targets may be set via CMake variable (e.g. `set(PICO_DEFAULT_TLS_IMPL pico_thread_local_none)`). + */ + +// PICO_CONFIG: PICO_THREAD_LOCAL_MODE_PER_THREAD, Enable proper thread local support with one value per thread, type=bool, default=1, group=pico_thread_local +// PICO_CONFIG: PICO_THREAD_LOCAL_MODE_GLOBAL, Support compiling code with thread local variables but only keep one global value, type=bool, default=0, group=pico_thread_local +// PICO_CONFIG: PICO_THREAD_LOCAL_MODE_NONE, No support for thread local variables.
Code using __thread may or may not compile/link or work correctly, type=bool, default=0, group=pico_thread_local + +// PICO_CONFIG: PICO_THREAD_LOCAL_SUPPORT_EMUTLS, Thread local support should work with compilers that use emutls, type=bool, default=1, advanced=true, group=pico_thread_local +// PICO_CONFIG: PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER, Thread local support should work with compilers that use a per thread pointer and provide .tdata and .tbss, type=bool, default=1, advanced=true, group=pico_thread_local + +// PICO_CONFIG: PICO_THREAD_LOCAL_EMUTLS_CONFIRMED, Passed from build if the compiler is known to use Emutls and so other types of support are not needed, type=bool, default=0, advanced=true, group=pico_thread_local +// PICO_CONFIG: PICO_THREAD_LOCAL_THREAD_POINTER_ARM_AEABI_CONFIRMED, Passed from build if the compiler is known to use a thread pointer accessed via __aeabi_read_tp and so other types of support are not needed, type=bool, default=0, advanced=true, group=pico_thread_local +// PICO_CONFIG: PICO_THREAD_LOCAL_THREAD_POINTER_RISCV_REG_CONFIRMED, Passed from build if the compiler is known to use a thread pointer stored in the TP register and so other types of support are not needed, type=bool, default=0, advanced=true, group=pico_thread_local + +// Default is PICO_THREAD_LOCAL_MODE_PER_THREAD if not specified +#if !PICO_THREAD_LOCAL_MODE_PER_THREAD && !PICO_THREAD_LOCAL_MODE_GLOBAL && !PICO_THREAD_LOCAL_MODE_NONE +#undef PICO_THREAD_LOCAL_MODE_PER_THREAD +#define PICO_THREAD_LOCAL_MODE_PER_THREAD 1 +#endif + +// Rest of file is contingent on not being in PICO_THREAD_LOCAL_MODE_NONE...
+#if !PICO_THREAD_LOCAL_MODE_NONE +#if !PICO_THREAD_LOCAL_SUPPORT_EMUTLS && !PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER +// PICO_THREAD_LOCAL_EMUTLS_CONFIRMED, PICO_THREAD_LOCAL_THREAD_POINTER_ARM_AEABI_CONFIRMED PICO_THREAD_LOCAL_THREAD_POINTER_RISCV_REG_CONFIRMED are +// optional defines provided by the build that allow us to improve code size slightly if only one is set + +#if PICO_THREAD_LOCAL_EMUTLS_CONFIRMED + PICO_THREAD_LOCAL_THREAD_POINTER_ARM_AEABI_CONFIRMED + PICO_THREAD_LOCAL_THREAD_POINTER_RISCV_REG_CONFIRMED > 1 +#error expected at most one of PICO_THREAD_LOCAL_EMUTLS_CONFIRMED, PICO_THREAD_LOCAL_THREAD_POINTER_ARM_AEABI_CONFIRMED and PICO_THREAD_LOCAL_THREAD_POINTER_RISCV_REG_CONFIRMED to be set +#endif + +#if PICO_THREAD_LOCAL_EMUTLS_CONFIRMED +#define PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER 0 +#define PICO_THREAD_LOCAL_SUPPORT_EMUTLS 1 +#elif PICO_THREAD_LOCAL_THREAD_POINTER_ARM_AEABI_CONFIRMED || PICO_THREAD_LOCAL_THREAD_POINTER_RISCV_REG_CONFIRMED || defined(PICOLIBC_TLS) || __riscv +// we assume here and in thread_local.c that if PICOLIBC_TLS is set, or we're on RISC-V +// then we're not using emutls - we may want to re-visit if in practice this is not always the case +// +// particularly, with respect to RISC-V the fact that we have to set TP on thread init +// means that TLS code cannot be elided on RISC-V so we want it to be as small as possible +#define PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER 1 +#define PICO_THREAD_LOCAL_SUPPORT_EMUTLS 0 +#else +// we know nothing, so support both at potential extra runtime cost +#define PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER 1 +#define PICO_THREAD_LOCAL_SUPPORT_EMUTLS 1 +#endif +#endif + +#if PICO_THREAD_LOCAL_SUPPORT_EMUTLS || PICO_THREAD_LOCAL_MODE_GLOBAL || PICO_THREAD_LOCAL_MODE_NONE +#define PICO_THREAD_LOCAL_REQUIRES_CUSTOM_TLS_SIZE 1 +#endif + +// we can't use the stock header if supporting emutls as the TLS size isn't known at link time +#if !PICO_THREAD_LOCAL_REQUIRES_CUSTOM_TLS_SIZE && 
defined(__has_include_next) && __has_include_next() +// just use the stock header +#include_next +#else + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if PICO_THREAD_LOCAL_MODE_GLOBAL || PICO_THREAD_LOCAL_MODE_NONE +static inline size_t _tls_size(void) { return 0; } +#elif PICO_THREAD_LOCAL_SUPPORT_EMUTLS +extern size_t _runtime_tls_size(void); +static inline size_t _tls_size(void) { return _runtime_tls_size(); } +#else +// standard definition for picolibc +extern char __tls_size[]; +static inline size_t _tls_size(void) { return (size_t) (uintptr_t) __tls_size; } +#endif + +/* + * Initialize a TLS block, copying the data segment from flash and + * zeroing the BSS segment. + */ +void _init_tls(void *tls); + +/* Set the TLS pointer to the specific block */ +void _set_tls(void *tls); + +#ifdef __cplusplus +} +#endif + +#endif + +#endif + +#endif diff --git a/src/rp2_common/pico_thread_local/thread_local.c b/src/rp2_common/pico_thread_local/thread_local.c new file mode 100644 index 000000000..36d33b3f5 --- /dev/null +++ b/src/rp2_common/pico_thread_local/thread_local.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2026 Raspberry Pi (Trading) Ltd. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +#include +#include +#include "pico.h" +#include +#include "pico/thread_local.h" +#include "pico/runtime_init.h" +#include "hardware/sync.h" + +#if !PICO_THREAD_LOCAL_MODE_PER_THREAD +// if not using per-thread mode we use tdata and tbss directly, so don't waste space on them. +// this is a bit of a hacky way of telling the linker this info! 
+char __used __attribute__((section(".tlsX_not_needed_marker"))) _tlsX_not_needed_marker; +#endif + +#if PICO_THREAD_LOCAL_MODE_PER_THREAD +#if PICO_THREAD_LOCAL_MODE_GLOBAL +#error PICO_THREAD_LOCAL_MODE_PER_THREAD and PICO_THREAD_LOCAL_MODE_GLOBAL are both specified +#endif +#if PICO_THREAD_LOCAL_MODE_NONE +#error PICO_THREAD_LOCAL_MODE_PER_THREAD and PICO_THREAD_LOCAL_MODE_NONE are both specified +#endif +// ------------------------------------------------------------ +// Proper TLS support per thread (PICO_THREAD_LOCAL_MODE_PER_THREAD = 1) +// ------------------------------------------------------------- + +#if !PICO_THREAD_LOCAL_THREAD_POINTER_VIA_RISCV_REG +// we don't have TP register so need to track one tls region per core +static_assert(NUM_CORES <= 2, ""); + +// per core pointers (not needed on RISC-V as we use TP reg) +static void *__tls_adjusted_by_core[2]; + +#if PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER && !__riscv +/* The size of the thread control block. + * TLS relocations are generated relative to + * a location this far *before* the first thread + * variable (!) + * NB: The actual size before tp also includes padding + * to align up to the alignment of .tdata/.tbss. + */ +extern uint8_t __arm32_tls_tcb_offset; +#define TLS_ADJUST ((size_t)&__arm32_tls_tcb_offset) +#endif +#endif + +#ifndef TLS_ADJUST +#define TLS_ADJUST 0 +#endif + +// Note our emutls support is only inside of PICO_THREAD_LOCAL_MODE_PER_THREAD as +// the library version suffices otherwise +#if PICO_THREAD_LOCAL_SUPPORT_EMUTLS +// From emutls.c: +// 'For every TLS variable xyz, there is one __emutls_control variable named __emutls_v.xyz. If xyz has +// non-zero initial value, __emutls_v.xyz's "value" will point to __emutls_t.xyz, which has the initial value.' +// +// The linker script groups all the __emutls_v.xyz variables into a single array and provides symbols +// __emutls_array_start and __emutls_array_end, which can be used to iterate over the array.
This allows +// the storage for each core's thread local variables to be pre-allocated and pre-initialized, which leaves +// minimal work for __emutls_get_address. +// +// This array is available to other TLS implementations too, such as a TLS implementation for an RTOS. + +// Same layout as libgcc __emutls_object. Unfortunately, __emutls_object doesn't appear in any header files. +typedef struct { + uint32_t size; + uint32_t align; + uint32_t offset; + void *tplate; +} tls_object_t; + +extern __weak tls_object_t __emutls_array_start; +extern __weak tls_object_t __emutls_array_end; + +static volatile bool emutls_one_time_init_done; +static uint32_t _emutls_size; +static uint32_t _emutls_align; +#endif + +// fill a linear region with data from the tls metadata (either emutls objects or tdata/tbss) +static inline void _tls_init_from_emutls_or_tdata(void *tls) { +#if PICO_THREAD_LOCAL_SUPPORT_EMUTLS + uint8_t *tls_adjusted = ((uint8_t *)tls) - TLS_ADJUST; + for (tls_object_t* tls_obj = &__emutls_array_start; tls_obj < &__emutls_array_end; ++tls_obj) { + if (tls_obj->tplate) { + memcpy(tls_adjusted + tls_obj->offset, tls_obj->tplate, tls_obj->size); + } else { + memset(tls_adjusted + tls_obj->offset, 0, tls_obj->size); + } + } +#endif +#if PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER + // when using thread pointers we expect data to come from tdata/tbss + extern __weak uint8_t __tdata_source[]; + extern __weak uint8_t __tdata_size[]; + extern __weak uint8_t __tbss_size[]; + + uint8_t *tdata_dest = ((uint8_t *)tls); + uint8_t *tbss_dest = ((uint8_t *)tls) + (size_t)__tdata_size; + + /* Copy initialized TLS data from the template */ + memcpy(tdata_dest, __tdata_source, (size_t)__tdata_size); + + /* Zero the uninitialized TLS data (.tbss) */ + memset(tbss_dest, 0, (size_t)__tbss_size); +#endif +} +#define _INIT_TLS_IMPL _tls_init_from_emutls_or_tdata + +static inline void _set_tls_per_thread(void *tls) { + assert(tls); // we should never be setting 0 +#if
!PICO_THREAD_LOCAL_THREAD_POINTER_VIA_RISCV_REG + __tls_adjusted_by_core[get_core_num()] = (uint8_t *)tls - TLS_ADJUST; +#else + pico_default_asm_volatile("mv tp, %0\n" : : "r" (tls)); +#endif +} +#define _SET_TLS_IMPL _set_tls_per_thread + +#if PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER +static __used void *_init_core_local_tls(void) { + /* Initialized by the linker, one per core */ + extern uint8_t __tls0_base[], __tls1_base[]; + static void * const __tls_bases[2] = { __tls0_base, __tls1_base }; + void *tls = __tls_bases[get_core_num()]; + _init_tls(tls); + _set_tls(tls); + return tls - TLS_ADJUST; +} + +#if PICO_THREAD_LOCAL_THREAD_POINTER_VIA_ARM_EABI +uint32_t __attribute__((naked)) __aeabi_read_tp(void) { +#if !__ARM_ARCH_6M__ + pico_default_asm_volatile( + "push {r1,lr} /* Save R1 (and LR) */\n" + "ldr r1,=0xd0000000 /* Address of SIO->CPUID */\n" + "ldr r1,[r1] /* Fetch active core */\n" + "ldr r0,=%0 /* Address of __tls array */\n" + "ldr r0,[r0,r1,lsl #2] /* Fetch __tls[CPUID] */\n" + "cbz r0, 1f\n" + "pop {r1,pc}\n" /* Restore R1 and return */ + "1:\n" + "pop {r1}\n" + "push {r1-r3}\n" + "bl _init_core_local_tls\n" + "pop {r1-r3,pc}\n" + : : "i" (__tls_adjusted_by_core) : "ip", "cc" + ); +#else + pico_default_asm_volatile( + "push {r1,lr} /* Save R1 (and LR) */\n" + "ldr r1,=0xd0000000 /* Address of SIO->CPUID */\n" + "ldr r1,[r1] /* Fetch active core */\n" + "ldr r0,=%0 /* Address of __tls array */\n" + "lsls r1, #2\n" + "ldr r0,[r0,r1] /* Fetch __tls[CPUID] */\n" + "cmp r0, #0\n" + "beq 1f\n" + "pop {r1,pc}\n" /* Restore R1 and return */ + "1:\n" + "pop {r1}\n" + "push {r1-r3}\n" + "bl _init_core_local_tls\n" + "pop {r1-r3,pc}\n" + : : "i" (__tls_adjusted_by_core) : "ip", "cc" + ); +#endif +} +#endif +#endif + +#if PICO_THREAD_LOCAL_SUPPORT_EMUTLS +static inline void *_get_tls_adjusted_for_core(uint core_num) { +#if !PICO_THREAD_LOCAL_THREAD_POINTER_VIA_RISCV_REG + return __tls_adjusted_by_core[core_num]; +#else +#error unsupported tls configuration // 
we haven't seen this in the wild so error for now + // ((void)core_num); + // void *tls; + // pico_default_asm_volatile("mv %0, tp\n" : "+r" (tls)); + // return tls; +#endif +} + +// This is called lazily (either as a result of __emutls_get_address +// or someone (perhaps RTOS) calling _runtime_tls_size first) to figure out +// the size of the TLS area per thread +static void _emutls_one_time_init(void) { + // Use spinlock so we don't add dependency on mutex code - we don't call anything + spin_lock_t *lock = spin_lock_instance(PICO_SPINLOCK_ID_HARDWARE_CLAIM); + uint32_t save = spin_lock_blocking(lock); + if (!emutls_one_time_init_done) { + // Three passes: + // 1) Calculate the offset of each thread local variable and the total storage to be allocated for each thread. + assert(!_emutls_size); + _emutls_align = 1; + for (tls_object_t* tls_obj = &__emutls_array_start; tls_obj < &__emutls_array_end; ++tls_obj) { + assert((tls_obj->align & (tls_obj->align - 1)) == 0); + + if (tls_obj->align > _emutls_align) { + _emutls_align = tls_obj->align; + } + + _emutls_size = (_emutls_size + tls_obj->align - 1) & ~(tls_obj->align - 1); + tls_obj->offset = _emutls_size + TLS_ADJUST; + _emutls_size += tls_obj->size; + } + emutls_one_time_init_done = true; + } + spin_unlock(lock, save); +} + +// When we support EMUTLS we have _tls_size() redirect here (from our replacement ) +size_t _runtime_tls_size(void) { + static_assert(PICO_THREAD_LOCAL_SUPPORT_EMUTLS, ""); // this function is only provided in this case + if (!emutls_one_time_init_done) _emutls_one_time_init(); + size_t tls_size = _emutls_size; +#if PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER + extern __weak char __tls_size[]; + // be defensive about having someone put both types of data + tls_size = MAX(tls_size, (size_t)&__tls_size); +#endif + return tls_size; +} + +static void *_emutls_per_core_init(void) { + uint core_num = get_core_num(); + assert(!_get_tls_adjusted_for_core(core_num)); + // do this first, as _emutls_align 
may be set as a side effect + size_t size = _tls_size(); + assert(size); + if (size) { + // aligned_alloc is not available in all libraries we support, and it isn't thread safe anyway, + // so we'll just do the padded malloc + // tls = aligned_alloc(_emutls_align, size); + void *tls = malloc(size + _emutls_align - 1); + // note we never free the memory, so bumping the pointer is fine + tls = (void *)((((uintptr_t)tls) + (_emutls_align - 1)) & ~(_emutls_align - 1)); + if (tls) { + _init_tls(tls); + _set_tls(tls); + } + } + return _get_tls_adjusted_for_core(core_num); +} + +void* __emutls_get_address(void* obj) { + void *tls_adjusted = _get_tls_adjusted_for_core(get_core_num()); + if (!tls_adjusted) { + tls_adjusted = _emutls_per_core_init(); + } + assert(tls_adjusted); + return tls_adjusted + ((tls_object_t *)obj)->offset; +} +#endif + +#if PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER && PICO_THREAD_LOCAL_THREAD_POINTER_VIA_RISCV_REG +// on RISC-V we must set up the pointer each time, note we don't actually respect PICO_THREAD_LOCAL_CORE1_REINITIALIZE +// on RISC-V as it is an optimization flag not a behavioral flag (i.e.
it is intended to be set to 0 +// if you don't need it vs don't want it) +#define _RUNTIME_INIT_PER_CORE_TLS_SETUP_IMPL _init_core_local_tls +#elif PICO_THREAD_LOCAL_CORE1_REINITIALIZE +static inline void _defer_core_local_init(void) { + // note that __tls_adjusted_by_core is in .bss, so on core 0 init it is definitely 0 already, so no need to set it to 0 + // to save space, we don't call get_core_num() but just clear __tls_adjusted_by_core[1], as runtime_init_per_core_tls_setup + // for core 0 will always be called before runtime_init_per_core_tls_setup for core 1 (so setting __tls_adjusted_by_core[1] + // to 0 is idempotent, and then runtime_init_per_core_tls_setup is never called again) + + // note we defer any actual work + // __tls_adjusted_by_core[get_core_num()] = 0; + __tls_adjusted_by_core[1] = 0; +} +#define _RUNTIME_INIT_PER_CORE_TLS_SETUP_IMPL _defer_core_local_init +#endif + +#elif PICO_THREAD_LOCAL_MODE_GLOBAL +#if PICO_THREAD_LOCAL_MODE_NONE +#error PICO_THREAD_LOCAL_MODE_GLOBAL and PICO_THREAD_LOCAL_MODE_NONE are both specified +#endif + +// ------------------------------------------------------------ +// TLS values are global +// ------------------------------------------------------------- +#define _INIT_TLS_IMPL(tls) ((void)tls) +#define _SET_TLS_IMPL(tls) ((void)tls) + +#if PICO_THREAD_LOCAL_SUPPORT_THREAD_POINTER +#if PICO_THREAD_LOCAL_THREAD_POINTER_VIA_ARM_EABI +// naked as we must preserve all regs +uint32_t __weak __attribute__((naked)) __aeabi_read_tp(void) { + pico_default_asm_volatile( + ".weak __tls_start\n" + "push {r1,lr}\n" + "ldr r0, = __tls_start\n" + "ldr r1, = __arm32_tls_tcb_offset\n" + "subs r0, r1\n" + "pop {r1, pc}\n" + ); +} +#elif PICO_THREAD_LOCAL_THREAD_POINTER_VIA_RISCV_REG +static inline void _global_tp_init(void) { + extern __weak char __tls_start[]; + pico_default_asm_volatile("mv tp, %0\n" : : "r" (__tls_start)); +} +#define _RUNTIME_INIT_PER_CORE_TLS_SETUP_IMPL _global_tp_init +#endif +#endif +#endif + +#if
PICO_THREAD_LOCAL_PROVIDE_INIT_TLS +void _init_tls(void *tls) { + // we expect an impl in this case + _INIT_TLS_IMPL(tls); +} +#endif + +#if PICO_THREAD_LOCAL_PROVIDE_SET_TLS +void _set_tls(void *tls) { + // we expect an impl in this case + _SET_TLS_IMPL(tls); +} +#endif + +#ifdef _RUNTIME_INIT_PER_CORE_TLS_SETUP_IMPL +void runtime_init_per_core_tls_setup(void) { + _RUNTIME_INIT_PER_CORE_TLS_SETUP_IMPL(); +} + +#if !PICO_RUNTIME_SKIP_INIT_PER_CORE_TLS_SETUP +PICO_RUNTIME_INIT_FUNC_PER_CORE(runtime_init_per_core_tls_setup, PICO_RUNTIME_INIT_PER_CORE_TLS_SETUP); +#endif +#endif + diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6e646f847..c9cd94124 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,4 +15,5 @@ if (PICO_ON_DEVICE) add_subdirectory(pico_sem_test) add_subdirectory(pico_sha256_test) add_subdirectory(pico_async_context_test) + add_subdirectory(pico_thread_local_test) endif() diff --git a/test/pico_thread_local_test/BUILD.bazel b/test/pico_thread_local_test/BUILD.bazel new file mode 100644 index 000000000..84c7a4cbf --- /dev/null +++ b/test/pico_thread_local_test/BUILD.bazel @@ -0,0 +1,27 @@ +load("@rules_cc//cc:cc_binary.bzl", "cc_binary") +load("//bazel:defs.bzl", "compatible_with_rp2") +load("//bazel/util:transition.bzl", "extra_copts_for_all_deps") + +package(default_visibility = ["//visibility:public"]) + +cc_binary( + name = "pico_thread_local_test_actual", + testonly = True, + srcs = ["pico_thread_local_test.c"], + tags = ["manual"], # Built via pico_thread_local_test. + # Doesn't appear to work on host builds yet. + target_compatible_with = compatible_with_rp2(), + deps = [ + "//src/rp2_common/pico_multicore", + "//src/rp2_common/pico_stdlib", + "//src/rp2_common/pico_thread_local", + ], +) + +extra_copts_for_all_deps( + name = "pico_thread_local_test", + testonly = True, + src = ":pico_thread_local_test_actual", + # Host doesn't support pico_multicore without pico_host_sdl. 
+ target_compatible_with = compatible_with_rp2(), +) diff --git a/test/pico_thread_local_test/CMakeLists.txt b/test/pico_thread_local_test/CMakeLists.txt new file mode 100644 index 000000000..277a8af7b --- /dev/null +++ b/test/pico_thread_local_test/CMakeLists.txt @@ -0,0 +1,20 @@ +if (TARGET pico_multicore) + # Note the global versions don't actually need multicore, but it didn't seem worth adding #ifdef to them all + add_executable(pico_thread_local_test pico_thread_local_test.c) + target_link_libraries(pico_thread_local_test PRIVATE pico_stdlib pico_multicore) + pico_add_extra_outputs(pico_thread_local_test) + + add_executable(pico_thread_local_test_global pico_thread_local_test.c) + target_link_libraries(pico_thread_local_test_global PRIVATE pico_stdlib pico_multicore) + pico_add_extra_outputs(pico_thread_local_test_global) + pico_set_tls_implementation(pico_thread_local_test_global global) + + add_executable(pico_thread_local_test_cpp pico_thread_local_test_cpp.cpp) + target_link_libraries(pico_thread_local_test_cpp PRIVATE pico_stdlib pico_multicore) + pico_add_extra_outputs(pico_thread_local_test_cpp) + + add_executable(pico_thread_local_test_cpp_global pico_thread_local_test_cpp.cpp) + target_link_libraries(pico_thread_local_test_cpp_global PRIVATE pico_stdlib pico_multicore) + pico_add_extra_outputs(pico_thread_local_test_cpp_global) + pico_set_tls_implementation(pico_thread_local_test_cpp_global global) +endif() \ No newline at end of file diff --git a/test/pico_thread_local_test/pico_thread_local_test.c b/test/pico_thread_local_test/pico_thread_local_test.c new file mode 100644 index 000000000..e2a6e6730 --- /dev/null +++ b/test/pico_thread_local_test/pico_thread_local_test.c @@ -0,0 +1,97 @@ +/** + * Copyright (c) 2020 Raspberry Pi (Trading) Ltd. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include +#include +#include +#include +#include "pico/stdlib.h" +#include "pico/multicore.h" +#include "pico/sem.h" + +#include +#define COUNT 10000 + +volatile __thread int counter = 7; +__thread int zero; +int core1_count; + +semaphore_t sem; + +int do_count(int delta) { + for (int i=0;i