generated from nhcarrigan/template
feat: we successfully have the installer working for windows!
Models are downloaded at runtime instead of build.
This commit is contained in:
@@ -0,0 +1,490 @@
|
||||
include(CheckCXXCompilerFlag)
|
||||
include("../cmake/common.cmake")
|
||||
|
||||
add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
|
||||
|
||||
# enable libstdc++ assertions for debug builds
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
if (GGML_SANITIZE_THREAD)
|
||||
add_compile_options(-fsanitize=thread)
|
||||
link_libraries (-fsanitize=thread)
|
||||
endif()
|
||||
|
||||
if (GGML_SANITIZE_ADDRESS)
|
||||
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
|
||||
link_libraries (-fsanitize=address)
|
||||
endif()
|
||||
|
||||
if (GGML_SANITIZE_UNDEFINED)
|
||||
add_compile_options(-fsanitize=undefined)
|
||||
link_libraries (-fsanitize=undefined)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_FATAL_WARNINGS)
|
||||
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
list(APPEND C_FLAGS -Werror)
|
||||
list(APPEND CXX_FLAGS -Werror)
|
||||
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
add_compile_options(/WX)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_ALL_WARNINGS)
|
||||
if (NOT MSVC)
|
||||
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
|
||||
list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
|
||||
-Werror=implicit-int -Werror=implicit-function-declaration)
|
||||
list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
|
||||
|
||||
list(APPEND C_FLAGS ${WARNING_FLAGS})
|
||||
list(APPEND CXX_FLAGS ${WARNING_FLAGS})
|
||||
|
||||
ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
|
||||
else()
|
||||
# todo : msvc
|
||||
set(C_FLAGS "")
|
||||
set(CXX_FLAGS "")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_LTO)
|
||||
include(CheckIPOSupported)
|
||||
check_ipo_supported(RESULT result OUTPUT output)
|
||||
if (result)
|
||||
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
|
||||
else()
|
||||
message(WARNING "IPO is not supported: ${output}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
|
||||
find_program(GGML_CCACHE_FOUND ccache)
|
||||
find_program(GGML_SCCACHE_FOUND sccache)
|
||||
|
||||
if (GGML_CCACHE_FOUND OR GGML_SCCACHE_FOUND)
|
||||
if(GGML_CCACHE_FOUND)
|
||||
set(GGML_CCACHE_VARIANT ccache)
|
||||
else()
|
||||
set(GGML_CCACHE_VARIANT sccache)
|
||||
endif()
|
||||
# TODO: should not be set globally
|
||||
if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32)
|
||||
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl")
|
||||
else ()
|
||||
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
|
||||
endif ()
|
||||
set(ENV{CCACHE_SLOPPINESS} time_macros)
|
||||
message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
|
||||
else()
|
||||
message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF")
|
||||
endif ()
|
||||
endif()
|
||||
|
||||
# this version of Apple ld64 is buggy
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
|
||||
ERROR_VARIABLE output
|
||||
OUTPUT_QUIET
|
||||
)
|
||||
|
||||
if (output MATCHES "dyld-1015\.7")
|
||||
add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
|
||||
endif()
|
||||
|
||||
# architecture specific
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
if (MSVC)
|
||||
string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
|
||||
message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
|
||||
else ()
|
||||
set(CMAKE_GENERATOR_PLATFORM_LWR "")
|
||||
endif ()
|
||||
ggml_get_system_arch()
|
||||
message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
|
||||
|
||||
if (NOT MSVC)
|
||||
if (GGML_STATIC)
|
||||
if (UNIX AND NOT APPLE)
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
|
||||
endif()
|
||||
add_link_options(-static)
|
||||
if (MINGW)
|
||||
add_link_options(-static-libgcc -static-libstdc++)
|
||||
endif()
|
||||
endif()
|
||||
if (GGML_GPROF)
|
||||
add_compile_options(-pg)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
#
|
||||
# POSIX conformance
|
||||
#
|
||||
|
||||
# clock_gettime came in POSIX.1b (1993)
|
||||
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
|
||||
# posix_memalign came in POSIX.1-2001 / SUSv3
|
||||
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
|
||||
|
||||
# Somehow in OpenBSD whenever POSIX conformance is specified
|
||||
# some string functions rely on locale_t availability,
|
||||
# which was introduced in POSIX.1-2008, forcing us to go higher
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
|
||||
add_compile_definitions(_XOPEN_SOURCE=700)
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES "AIX")
|
||||
# Don't define _XOPEN_SOURCE. We need _ALL_SOURCE, which is the default,
|
||||
# in order to define _SC_PHYS_PAGES.
|
||||
else()
|
||||
add_compile_definitions(_XOPEN_SOURCE=600)
|
||||
endif()
|
||||
|
||||
# Data types, macros and functions related to controlling CPU affinity and
|
||||
# some memory allocation are available on Linux through GNU extensions in libc
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
add_compile_definitions(_GNU_SOURCE)
|
||||
endif()
|
||||
|
||||
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
||||
# and on macOS its availability depends on enabling Darwin extensions
|
||||
# similarly on DragonFly, enabling BSD extensions is necessary
|
||||
if (
|
||||
CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
|
||||
CMAKE_SYSTEM_NAME MATCHES "iOS" OR
|
||||
CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
|
||||
CMAKE_SYSTEM_NAME MATCHES "DragonFly"
|
||||
)
|
||||
add_compile_definitions(_DARWIN_C_SOURCE)
|
||||
endif()
|
||||
|
||||
# alloca is a non-standard interface that is not visible on BSDs when
|
||||
# POSIX conformance is specified, but not all of them provide a clean way
|
||||
# to enable it in such cases
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
||||
add_compile_definitions(__BSD_VISIBLE)
|
||||
endif()
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
|
||||
add_compile_definitions(_NETBSD_SOURCE)
|
||||
endif()
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
|
||||
add_compile_definitions(_BSD_SOURCE)
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||
endif()
|
||||
|
||||
# ggml
|
||||
|
||||
if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
|
||||
message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
|
||||
endif()
|
||||
|
||||
add_library(ggml-base
|
||||
../include/ggml.h
|
||||
../include/ggml-alloc.h
|
||||
../include/ggml-backend.h
|
||||
../include/ggml-cpp.h
|
||||
../include/ggml-opt.h
|
||||
../include/gguf.h
|
||||
ggml.c
|
||||
ggml.cpp
|
||||
ggml-alloc.c
|
||||
ggml-backend.cpp
|
||||
ggml-opt.cpp
|
||||
ggml-threading.cpp
|
||||
ggml-threading.h
|
||||
ggml-quants.c
|
||||
ggml-quants.h
|
||||
gguf.cpp)
|
||||
|
||||
set_target_properties(ggml-base PROPERTIES
|
||||
VERSION ${GGML_VERSION}
|
||||
SOVERSION ${GGML_VERSION_MAJOR}
|
||||
)
|
||||
|
||||
target_include_directories(ggml-base PRIVATE .)
|
||||
if (GGML_BACKEND_DL)
|
||||
target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
|
||||
endif()
|
||||
|
||||
if (GGML_SCHED_NO_REALLOC)
|
||||
target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
|
||||
endif()
|
||||
|
||||
add_library(ggml
|
||||
ggml-backend-reg.cpp)
|
||||
add_library(ggml::ggml ALIAS ggml)
|
||||
|
||||
set_target_properties(ggml PROPERTIES
|
||||
VERSION ${GGML_VERSION}
|
||||
SOVERSION ${GGML_VERSION_MAJOR}
|
||||
)
|
||||
|
||||
if (GGML_BACKEND_DIR)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
|
||||
endif()
|
||||
target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
|
||||
endif()
|
||||
|
||||
target_link_libraries(ggml PUBLIC ggml-base)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
target_link_libraries(ggml PRIVATE dl)
|
||||
endif()
|
||||
|
||||
function(ggml_add_backend_library backend)
|
||||
if (GGML_BACKEND_DL)
|
||||
add_library(${backend} MODULE ${ARGN})
|
||||
# write the shared library to the output directory
|
||||
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
|
||||
add_dependencies(ggml ${backend})
|
||||
if (GGML_BACKEND_DIR)
|
||||
install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
|
||||
else()
|
||||
install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
endif()
|
||||
else()
|
||||
add_library(${backend} ${ARGN})
|
||||
target_link_libraries(ggml PUBLIC ${backend})
|
||||
install(TARGETS ${backend} LIBRARY)
|
||||
endif()
|
||||
|
||||
target_link_libraries(${backend} PRIVATE ggml-base)
|
||||
target_include_directories(${backend} PRIVATE ..)
|
||||
|
||||
if (${BUILD_SHARED_LIBS})
|
||||
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
|
||||
target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED)
|
||||
endif()
|
||||
|
||||
# Set versioning properties for all backend libraries
|
||||
# Building a MODULE library with a version is not supported on macOS (https://gitlab.kitware.com/cmake/cmake/-/issues/20782)
|
||||
if (NOT (APPLE AND GGML_BACKEND_DL))
|
||||
set_target_properties(${backend} PROPERTIES
|
||||
VERSION ${GGML_VERSION}
|
||||
SOVERSION ${GGML_VERSION_MAJOR}
|
||||
)
|
||||
endif()
|
||||
|
||||
if(NOT GGML_AVAILABLE_BACKENDS)
|
||||
set(GGML_AVAILABLE_BACKENDS "${backend}"
|
||||
CACHE INTERNAL "List of backends for cmake package")
|
||||
else()
|
||||
list(FIND GGML_AVAILABLE_BACKENDS "${backend}" has_backend)
|
||||
if(has_backend EQUAL -1)
|
||||
set(GGML_AVAILABLE_BACKENDS "${GGML_AVAILABLE_BACKENDS};${backend}"
|
||||
CACHE INTERNAL "List of backends for cmake package")
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function(ggml_add_backend backend)
|
||||
string(TOUPPER "GGML_${backend}" backend_id)
|
||||
if (${backend_id})
|
||||
string(TOLOWER "ggml-${backend}" backend_target)
|
||||
add_subdirectory(${backend_target})
|
||||
message(STATUS "Including ${backend} backend")
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
string(TOUPPER "GGML_USE_${backend}" backend_use)
|
||||
target_compile_definitions(ggml PUBLIC ${backend_use})
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function(ggml_add_cpu_backend_variant tag_name)
|
||||
set(GGML_CPU_TAG_NAME ${tag_name})
|
||||
# other: OPENMP LLAMAFILE CPU_HBM
|
||||
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
||||
foreach (feat NATIVE
|
||||
SSE42
|
||||
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
||||
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
||||
AMX_TILE AMX_INT8 AMX_BF16)
|
||||
set(GGML_${feat} OFF)
|
||||
endforeach()
|
||||
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_${feat} ON)
|
||||
endforeach()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_INTERNAL_${feat} ON)
|
||||
endforeach()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_INTERNAL_${feat} ON)
|
||||
endforeach()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
|
||||
foreach (feat VXE2 NNPA)
|
||||
set(GGML_INTERNAL_${feat} OFF)
|
||||
endforeach()
|
||||
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_INTERNAL_${feat} ON)
|
||||
endforeach()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
|
||||
foreach (feat RVV)
|
||||
set(GGML_INTERNAL_${feat} OFF)
|
||||
endforeach()
|
||||
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_INTERNAL_${feat} ON)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||
endfunction()
|
||||
|
||||
ggml_add_backend(CPU)
|
||||
|
||||
if (GGML_CPU_ALL_VARIANTS)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
||||
elseif (GGML_CPU_ARM_ARCH)
|
||||
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
||||
ggml_add_cpu_backend_variant(x64)
|
||||
ggml_add_cpu_backend_variant(sse42 SSE42)
|
||||
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
|
||||
if (NOT MSVC)
|
||||
# __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
ggml_add_cpu_backend_variant(ivybridge SSE42 AVX F16C)
|
||||
ggml_add_cpu_backend_variant(piledriver SSE42 AVX F16C FMA)
|
||||
endif()
|
||||
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C FMA AVX2 BMI2)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C FMA AVX2 BMI2 AVX512)
|
||||
ggml_add_cpu_backend_variant(cannonlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI)
|
||||
ggml_add_cpu_backend_variant(cascadelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
if (NOT MSVC)
|
||||
# MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
|
||||
# https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
|
||||
# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
|
||||
ggml_add_cpu_backend_variant(cooperlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI AVX512_BF16)
|
||||
ggml_add_cpu_backend_variant(zen4 SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16)
|
||||
endif()
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI)
|
||||
if (NOT MSVC)
|
||||
# MSVC doesn't support AMX
|
||||
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
endif()
|
||||
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
# Many of these features are optional so we build versions with popular
|
||||
# combinations and name the backends based on the version they were
|
||||
# first released with
|
||||
ggml_add_cpu_backend_variant(armv8.0_1)
|
||||
ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
|
||||
ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
||||
ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
|
||||
ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
|
||||
ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
|
||||
ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
|
||||
ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
# Android-specific backends with SoC-compatible feature sets
|
||||
ggml_add_cpu_backend_variant(android_armv8.0_1)
|
||||
ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
|
||||
ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
||||
ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
|
||||
ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
|
||||
ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
|
||||
ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SVE2 SME)
|
||||
elseif (APPLE)
|
||||
ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
|
||||
ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
|
||||
ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME)
|
||||
else()
|
||||
message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
ggml_add_cpu_backend_variant(power0)
|
||||
ggml_add_cpu_backend_variant(power7_1 POWER7)
|
||||
ggml_add_cpu_backend_variant(power7_2 POWER7 VSX)
|
||||
ggml_add_cpu_backend_variant(power8_1 POWER8)
|
||||
ggml_add_cpu_backend_variant(power8_2 POWER8 VSX)
|
||||
ggml_add_cpu_backend_variant(power9 POWER9 VSX)
|
||||
ggml_add_cpu_backend_variant(power10 POWER10 VSX)
|
||||
ggml_add_cpu_backend_variant(power11 POWER11 VSX)
|
||||
else()
|
||||
message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
ggml_add_cpu_backend_variant(z15 Z15 VXE2)
|
||||
ggml_add_cpu_backend_variant(z16 Z16 VXE2 NNPA)
|
||||
else()
|
||||
message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
ggml_add_cpu_backend_variant(riscv64_0)
|
||||
ggml_add_cpu_backend_variant(riscv64_v RVV)
|
||||
else()
|
||||
message(FATAL_ERROR "Unsupported RISC-V target OS: ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
elseif (GGML_CPU)
|
||||
ggml_add_cpu_backend_variant_impl("")
|
||||
endif()
|
||||
|
||||
ggml_add_backend(BLAS)
|
||||
ggml_add_backend(CANN)
|
||||
ggml_add_backend(CUDA)
|
||||
ggml_add_backend(HIP)
|
||||
ggml_add_backend(METAL)
|
||||
ggml_add_backend(MUSA)
|
||||
ggml_add_backend(RPC)
|
||||
ggml_add_backend(SYCL)
|
||||
ggml_add_backend(Vulkan)
|
||||
ggml_add_backend(WebGPU)
|
||||
ggml_add_backend(zDNN)
|
||||
ggml_add_backend(OpenCL)
|
||||
ggml_add_backend(Hexagon)
|
||||
ggml_add_backend(ZenDNN)
|
||||
|
||||
foreach (target ggml-base ggml)
|
||||
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
||||
target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
|
||||
endforeach()
|
||||
|
||||
target_link_libraries(ggml-base PRIVATE Threads::Threads)
|
||||
|
||||
find_library(MATH_LIBRARY m)
|
||||
if (MATH_LIBRARY)
|
||||
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
target_link_libraries(ggml-base PRIVATE m)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
target_link_libraries(ggml-base PRIVATE dl)
|
||||
endif()
|
||||
|
||||
if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
|
||||
target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
|
||||
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
foreach (target ggml-base ggml)
|
||||
set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_definitions(${target} PRIVATE GGML_BUILD)
|
||||
target_compile_definitions(${target} PUBLIC GGML_SHARED)
|
||||
endforeach()
|
||||
endif()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,255 @@
|
||||
#pragma once
|
||||
|
||||
// ggml-backend internal header
|
||||
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_BACKEND_API_VERSION 2
|
||||
|
||||
//
|
||||
// Backend buffer type
|
||||
//
|
||||
|
||||
struct ggml_backend_buffer_type_i {
|
||||
const char * (*get_name) (ggml_backend_buffer_type_t buft);
|
||||
// allocate a buffer of this type
|
||||
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||
// tensor alignment
|
||||
size_t (*get_alignment) (ggml_backend_buffer_type_t buft);
|
||||
// (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
|
||||
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
|
||||
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
|
||||
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
|
||||
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer_type {
|
||||
struct ggml_backend_buffer_type_i iface;
|
||||
ggml_backend_dev_t device;
|
||||
void * context;
|
||||
};
|
||||
|
||||
//
|
||||
// Backend buffer
|
||||
//
|
||||
|
||||
struct ggml_backend_buffer_i {
|
||||
// (optional) free the buffer
|
||||
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||
// base address of the buffer
|
||||
void * (*get_base) (ggml_backend_buffer_t buffer);
|
||||
// (optional) initialize a tensor in the buffer (eg. add tensor extras)
|
||||
enum ggml_status (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
// tensor data access
|
||||
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
||||
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
// (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
|
||||
bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
// clear the entire buffer
|
||||
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
// (optional) reset any internal state due to tensor initialization, such as tensor extras
|
||||
void (*reset) (ggml_backend_buffer_t buffer);
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer {
|
||||
struct ggml_backend_buffer_i iface;
|
||||
ggml_backend_buffer_type_t buft;
|
||||
void * context;
|
||||
size_t size;
|
||||
enum ggml_backend_buffer_usage usage;
|
||||
};
|
||||
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
ggml_backend_buffer_type_t buft,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
void * context,
|
||||
size_t size);
|
||||
|
||||
// do not use directly, use ggml_backend_tensor_copy instead
|
||||
GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// multi-buffer
|
||||
// buffer that contains a collection of buffers
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
||||
GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
|
||||
//
|
||||
// Backend (stream)
|
||||
//
|
||||
|
||||
struct ggml_backend_i {
|
||||
const char * (*get_name)(ggml_backend_t backend);
|
||||
|
||||
void (*free)(ggml_backend_t backend);
|
||||
|
||||
// (optional) asynchronous tensor data access
|
||||
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// (optional) complete all pending operations (required if the backend supports async operations)
|
||||
void (*synchronize)(ggml_backend_t backend);
|
||||
|
||||
// (optional) graph plans (not used currently)
|
||||
// compute graph with a plan
|
||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
||||
void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
|
||||
// compute the graph with the plan
|
||||
enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
|
||||
// compute graph (always async if supported by the backend)
|
||||
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
// (optional) event synchronization
|
||||
// record an event on this stream
|
||||
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
|
||||
// wait for an event on on a different stream
|
||||
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
||||
|
||||
// (optional) sort/optimize the nodes in the graph
|
||||
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
};
|
||||
|
||||
struct ggml_backend {
|
||||
ggml_guid_t guid;
|
||||
struct ggml_backend_i iface;
|
||||
ggml_backend_dev_t device;
|
||||
void * context;
|
||||
};
|
||||
|
||||
struct ggml_backend_event {
|
||||
struct ggml_backend_device * device;
|
||||
void * context;
|
||||
};
|
||||
|
||||
//
|
||||
// Backend device
|
||||
//
|
||||
|
||||
// Note: if additional properties are needed, we should add a struct with all of them
|
||||
// the current functions to obtain the properties can remain, since they are more convenient for often used properties
|
||||
struct ggml_backend_device_i {
|
||||
// device name: short identifier for this device, such as "CPU" or "CUDA0"
|
||||
const char * (*get_name)(ggml_backend_dev_t dev);
|
||||
|
||||
// device description: short informative description of the device, could be the model name
|
||||
const char * (*get_description)(ggml_backend_dev_t dev);
|
||||
|
||||
// device memory in bytes: 0 bytes to indicate no memory to report
|
||||
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
|
||||
|
||||
// device type
|
||||
enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
|
||||
|
||||
// device properties
|
||||
void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
|
||||
|
||||
// backend (stream) initialization
|
||||
ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
|
||||
|
||||
// preferred buffer type
|
||||
ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
|
||||
|
||||
// (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
|
||||
ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
|
||||
|
||||
// (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
|
||||
ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
|
||||
|
||||
// check if the backend can compute an operation
|
||||
bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
|
||||
|
||||
// check if the backend can use tensors allocated in a buffer type
|
||||
bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
|
||||
|
||||
// (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
|
||||
// these should be expensive operations that may benefit from running on this backend instead of the CPU backend
|
||||
bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
|
||||
|
||||
// (optional) event synchronization
|
||||
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
|
||||
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
|
||||
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
|
||||
};
|
||||
|
||||
struct ggml_backend_device {
|
||||
struct ggml_backend_device_i iface;
|
||||
ggml_backend_reg_t reg;
|
||||
void * context;
|
||||
};
|
||||
|
||||
//
|
||||
// Backend (reg)
|
||||
//
|
||||
|
||||
struct ggml_backend_reg_i {
|
||||
const char * (*get_name)(ggml_backend_reg_t reg);
|
||||
|
||||
// enumerate available devices
|
||||
size_t (*get_device_count)(ggml_backend_reg_t reg);
|
||||
ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
|
||||
|
||||
// (optional) get a pointer to a function in the backend
|
||||
// backends can add custom functions that are not part of the standard ggml-backend interface
|
||||
void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
|
||||
};
|
||||
|
||||
struct ggml_backend_reg {
|
||||
int api_version; // initialize to GGML_BACKEND_API_VERSION
|
||||
struct ggml_backend_reg_i iface;
|
||||
void * context;
|
||||
};
|
||||
|
||||
// Add backend dynamic loading support to the backend
|
||||
|
||||
// Initialize the backend
|
||||
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
|
||||
// Optional: obtain a score for the backend based on the system configuration
|
||||
// Higher scores are preferred, 0 means the backend is not supported in the current system
|
||||
typedef int (*ggml_backend_score_t)(void);
|
||||
|
||||
#ifdef GGML_BACKEND_DL
|
||||
# ifdef __cplusplus
|
||||
# define GGML_BACKEND_DL_IMPL(reg_fn) \
|
||||
extern "C" { \
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
|
||||
} \
|
||||
ggml_backend_reg_t ggml_backend_init(void) { \
|
||||
return reg_fn(); \
|
||||
}
|
||||
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
|
||||
extern "C" { \
|
||||
GGML_BACKEND_API int ggml_backend_score(void); \
|
||||
} \
|
||||
int ggml_backend_score(void) { \
|
||||
return score_fn(); \
|
||||
}
|
||||
# else
|
||||
# define GGML_BACKEND_DL_IMPL(reg_fn) \
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
|
||||
ggml_backend_reg_t ggml_backend_init(void) { \
|
||||
return reg_fn(); \
|
||||
}
|
||||
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
|
||||
GGML_BACKEND_API int ggml_backend_score(void); \
|
||||
int ggml_backend_score(void) { \
|
||||
return score_fn(); \
|
||||
}
|
||||
# endif
|
||||
#else
|
||||
# define GGML_BACKEND_DL_IMPL(reg_fn)
|
||||
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,632 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <filesystem>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
#include <cctype>
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#elif defined(__APPLE__)
|
||||
# include <mach-o/dyld.h>
|
||||
# include <dlfcn.h>
|
||||
#else
|
||||
# include <dlfcn.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
|
||||
// Backend registry
|
||||
#ifdef GGML_USE_CPU
|
||||
#include "ggml-cpu.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
#include "ggml-sycl.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
#include "ggml-vulkan.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_WEBGPU
|
||||
#include "ggml-webgpu.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_ZDNN
|
||||
#include "ggml-zdnn.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_OPENCL
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_HEXAGON
|
||||
#include "ggml-hexagon.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_BLAS
|
||||
#include "ggml-blas.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_RPC
|
||||
#include "ggml-rpc.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_ZENDNN
|
||||
#include "ggml-zendnn.h"
|
||||
#endif
|
||||
|
||||
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic push
|
||||
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#elif defined(__GNUC__)
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
static std::string path_str(const fs::path & path) {
|
||||
std::string u8path;
|
||||
try {
|
||||
#if defined(__cpp_lib_char8_t)
|
||||
// C++20 and later: u8string() returns std::u8string
|
||||
std::u8string u8str = path.u8string();
|
||||
u8path = std::string(reinterpret_cast<const char*>(u8str.c_str()));
|
||||
#else
|
||||
// C++17: u8string() returns std::string
|
||||
u8path = path.u8string();
|
||||
#endif
|
||||
} catch (...) {
|
||||
}
|
||||
return u8path;
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic pop
|
||||
#elif defined(__GNUC__)
|
||||
# pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||
|
||||
struct dl_handle_deleter {
|
||||
void operator()(HMODULE handle) {
|
||||
FreeLibrary(handle);
|
||||
}
|
||||
};
|
||||
|
||||
static dl_handle * dl_load_library(const fs::path & path) {
|
||||
// suppress error dialogs for missing DLLs
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
HMODULE handle = LoadLibraryW(path.wstring().c_str());
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
void * p = (void *) GetProcAddress(handle, name);
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static const char * dl_error() {
|
||||
return "";
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
using dl_handle = void;
|
||||
|
||||
struct dl_handle_deleter {
|
||||
void operator()(void * handle) {
|
||||
dlclose(handle);
|
||||
}
|
||||
};
|
||||
|
||||
static void * dl_load_library(const fs::path & path) {
|
||||
dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
return dlsym(handle, name);
|
||||
}
|
||||
|
||||
static const char * dl_error() {
|
||||
const char *rslt = dlerror();
|
||||
return rslt != nullptr ? rslt : "";
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
|
||||
|
||||
struct ggml_backend_reg_entry {
|
||||
ggml_backend_reg_t reg;
|
||||
dl_handle_ptr handle;
|
||||
};
|
||||
|
||||
struct ggml_backend_registry {
|
||||
std::vector<ggml_backend_reg_entry> backends;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
|
||||
ggml_backend_registry() {
|
||||
#ifdef GGML_USE_CUDA
|
||||
register_backend(ggml_backend_cuda_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_METAL
|
||||
register_backend(ggml_backend_metal_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_SYCL
|
||||
register_backend(ggml_backend_sycl_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_VULKAN
|
||||
register_backend(ggml_backend_vk_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_WEBGPU
|
||||
register_backend(ggml_backend_webgpu_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_ZDNN
|
||||
register_backend(ggml_backend_zdnn_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_OPENCL
|
||||
register_backend(ggml_backend_opencl_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_ZENDNN
|
||||
register_backend(ggml_backend_zendnn_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_HEXAGON
|
||||
register_backend(ggml_backend_hexagon_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_CANN
|
||||
register_backend(ggml_backend_cann_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_BLAS
|
||||
register_backend(ggml_backend_blas_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_RPC
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_CPU
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
#endif
|
||||
}
|
||||
|
||||
~ggml_backend_registry() {
|
||||
// FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
|
||||
// since backend threads may still be running and accessing resources from the dynamic library
|
||||
for (auto & entry : backends) {
|
||||
if (entry.handle) {
|
||||
entry.handle.release(); // NOLINT
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
|
||||
if (!reg) {
|
||||
return;
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
|
||||
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
||||
#endif
|
||||
backends.push_back({ reg, std::move(handle) });
|
||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||
register_device(ggml_backend_reg_dev_get(reg, i));
|
||||
}
|
||||
}
|
||||
|
||||
void register_device(ggml_backend_dev_t device) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
||||
#endif
|
||||
devices.push_back(device);
|
||||
}
|
||||
|
||||
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
|
||||
dl_handle_ptr handle { dl_load_library(path) };
|
||||
if (!handle) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(path).c_str(), dl_error());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (score_fn && score_fn() == 0) {
|
||||
if (!silent) {
|
||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path_str(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
||||
if (!backend_init_fn) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path_str(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_reg_t reg = backend_init_fn();
|
||||
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
||||
if (!silent) {
|
||||
if (!reg) {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n",
|
||||
__func__, path_str(path).c_str());
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
||||
__func__, path_str(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
|
||||
|
||||
register_backend(reg, std::move(handle));
|
||||
|
||||
return reg;
|
||||
}
|
||||
|
||||
void unload_backend(ggml_backend_reg_t reg, bool silent) {
|
||||
auto it = std::find_if(backends.begin(), backends.end(),
|
||||
[reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
|
||||
|
||||
if (it == backends.end()) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: backend not found\n", __func__);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (!silent) {
|
||||
GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
|
||||
}
|
||||
|
||||
// remove devices
|
||||
devices.erase(
|
||||
std::remove_if(devices.begin(), devices.end(),
|
||||
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
|
||||
devices.end());
|
||||
|
||||
// remove backend
|
||||
backends.erase(it);
|
||||
}
|
||||
};
|
||||
|
||||
static ggml_backend_registry & get_reg() {
|
||||
static ggml_backend_registry reg;
|
||||
return reg;
|
||||
}
|
||||
|
||||
// Internal API
|
||||
void ggml_backend_register(ggml_backend_reg_t reg) {
|
||||
get_reg().register_backend(reg);
|
||||
}
|
||||
|
||||
void ggml_backend_device_register(ggml_backend_dev_t device) {
|
||||
get_reg().register_device(device);
|
||||
}
|
||||
|
||||
// Backend (reg) enumeration
|
||||
static bool striequals(const char * a, const char * b) {
|
||||
for (; *a && *b; a++, b++) {
|
||||
if (std::tolower(*a) != std::tolower(*b)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return *a == *b;
|
||||
}
|
||||
|
||||
size_t ggml_backend_reg_count() {
|
||||
return get_reg().backends.size();
|
||||
}
|
||||
|
||||
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_reg_count());
|
||||
return get_reg().backends[index].reg;
|
||||
}
|
||||
|
||||
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
|
||||
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
||||
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
||||
if (striequals(ggml_backend_reg_name(reg), name)) {
|
||||
return reg;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Device enumeration
|
||||
size_t ggml_backend_dev_count() {
|
||||
return get_reg().devices.size();
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_dev_count());
|
||||
return get_reg().devices[index];
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
if (striequals(ggml_backend_dev_name(dev), name)) {
|
||||
return dev;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
if (ggml_backend_dev_type(dev) == type) {
|
||||
return dev;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Convenience functions
|
||||
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
|
||||
if (!dev) {
|
||||
return nullptr;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, params);
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
|
||||
if (!dev) {
|
||||
return nullptr;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, params);
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_init_best(void) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
|
||||
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
|
||||
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!dev) {
|
||||
return nullptr;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, nullptr);
|
||||
}
|
||||
|
||||
// Dynamic loading
|
||||
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
||||
return get_reg().load_backend(path, false);
|
||||
}
|
||||
|
||||
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
||||
get_reg().unload_backend(reg, true);
|
||||
}
|
||||
|
||||
static fs::path get_executable_path() {
|
||||
#if defined(__APPLE__)
|
||||
// get executable path
|
||||
std::vector<char> path;
|
||||
uint32_t size;
|
||||
while (true) {
|
||||
size = path.size();
|
||||
if (_NSGetExecutablePath(path.data(), &size) == 0) {
|
||||
break;
|
||||
}
|
||||
path.resize(size);
|
||||
}
|
||||
std::string base_path(path.data(), size);
|
||||
// remove executable name
|
||||
auto last_slash = base_path.find_last_of('/');
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
return base_path + "/";
|
||||
#elif defined(__linux__) || defined(__FreeBSD__)
|
||||
std::string base_path = ".";
|
||||
std::vector<char> path(1024);
|
||||
while (true) {
|
||||
// get executable path
|
||||
# if defined(__linux__)
|
||||
ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
|
||||
# elif defined(__FreeBSD__)
|
||||
ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
|
||||
# endif
|
||||
if (len == -1) {
|
||||
break;
|
||||
}
|
||||
if (len < (ssize_t) path.size()) {
|
||||
base_path = std::string(path.data(), len);
|
||||
// remove executable name
|
||||
auto last_slash = base_path.find_last_of('/');
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
break;
|
||||
}
|
||||
path.resize(path.size() * 2);
|
||||
}
|
||||
|
||||
return base_path + "/";
|
||||
#elif defined(_WIN32)
|
||||
std::vector<wchar_t> path(MAX_PATH);
|
||||
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
|
||||
if (len == 0) {
|
||||
return {};
|
||||
}
|
||||
std::wstring base_path(path.data(), len);
|
||||
// remove executable name
|
||||
auto last_slash = base_path.find_last_of('\\');
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
return base_path + L"\\";
|
||||
#else
|
||||
return {};
|
||||
#endif
|
||||
}
|
||||
|
||||
static fs::path backend_filename_prefix() {
|
||||
#ifdef _WIN32
|
||||
return fs::u8path("ggml-");
|
||||
#else
|
||||
return fs::u8path("libggml-");
|
||||
#endif
|
||||
}
|
||||
|
||||
static fs::path backend_filename_extension() {
|
||||
#ifdef _WIN32
|
||||
return fs::u8path(".dll");
|
||||
#else
|
||||
return fs::u8path(".so");
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
||||
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
||||
const fs::path name_path = fs::u8path(name);
|
||||
const fs::path file_prefix = backend_filename_prefix().native() + name_path.native() + fs::u8path("-").native();
|
||||
const fs::path file_extension = backend_filename_extension();
|
||||
|
||||
std::vector<fs::path> search_paths;
|
||||
if (user_search_path == nullptr) {
|
||||
#ifdef GGML_BACKEND_DIR
|
||||
search_paths.push_back(fs::u8path(GGML_BACKEND_DIR));
|
||||
#endif
|
||||
// default search paths: executable directory, current directory
|
||||
search_paths.push_back(get_executable_path());
|
||||
search_paths.push_back(fs::current_path());
|
||||
} else {
|
||||
search_paths.push_back(fs::u8path(user_search_path));
|
||||
}
|
||||
|
||||
int best_score = 0;
|
||||
fs::path best_path;
|
||||
|
||||
for (const auto & search_path : search_paths) {
|
||||
if (std::error_code ec; !fs::exists(search_path, ec)) {
|
||||
if (ec) {
|
||||
GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
|
||||
} else {
|
||||
GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
||||
for (const auto & entry : dir_it) {
|
||||
if (entry.is_regular_file()) {
|
||||
auto filename = entry.path().filename();
|
||||
auto ext = entry.path().extension();
|
||||
if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
|
||||
dl_handle_ptr handle { dl_load_library(entry) };
|
||||
if (!handle && !silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(entry.path()).c_str(), dl_error());
|
||||
}
|
||||
if (handle) {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (score_fn) {
|
||||
int s = score_fn();
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_str(entry.path()).c_str(), s);
|
||||
#endif
|
||||
if (s > best_score) {
|
||||
best_score = s;
|
||||
best_path = entry.path();
|
||||
}
|
||||
} else {
|
||||
if (!silent) {
|
||||
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, path_str(entry.path()).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (best_score == 0) {
|
||||
// try to load the base backend
|
||||
for (const auto & search_path : search_paths) {
|
||||
fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
|
||||
fs::path path = search_path / filename;
|
||||
if (std::error_code ec; fs::exists(path, ec)) {
|
||||
return get_reg().load_backend(path, silent);
|
||||
} else {
|
||||
if (ec) {
|
||||
GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(path).c_str(), ec.message().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return get_reg().load_backend(best_path, silent);
|
||||
}
|
||||
|
||||
void ggml_backend_load_all() {
|
||||
ggml_backend_load_all_from_path(nullptr);
|
||||
}
|
||||
|
||||
void ggml_backend_load_all_from_path(const char * dir_path) {
|
||||
#ifdef NDEBUG
|
||||
bool silent = true;
|
||||
#else
|
||||
bool silent = false;
|
||||
#endif
|
||||
|
||||
ggml_backend_load_best("blas", silent, dir_path);
|
||||
ggml_backend_load_best("zendnn", silent, dir_path);
|
||||
ggml_backend_load_best("cann", silent, dir_path);
|
||||
ggml_backend_load_best("cuda", silent, dir_path);
|
||||
ggml_backend_load_best("hip", silent, dir_path);
|
||||
ggml_backend_load_best("metal", silent, dir_path);
|
||||
ggml_backend_load_best("rpc", silent, dir_path);
|
||||
ggml_backend_load_best("sycl", silent, dir_path);
|
||||
ggml_backend_load_best("vulkan", silent, dir_path);
|
||||
ggml_backend_load_best("opencl", silent, dir_path);
|
||||
ggml_backend_load_best("hexagon", silent, dir_path);
|
||||
ggml_backend_load_best("musa", silent, dir_path);
|
||||
ggml_backend_load_best("cpu", silent, dir_path);
|
||||
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
|
||||
const char * backend_path = std::getenv("GGML_BACKEND_PATH");
|
||||
if (backend_path) {
|
||||
ggml_backend_load(backend_path);
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,689 @@
|
||||
function(ggml_add_cpu_backend_features cpu_name arch)
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
# build, using set_source_files_properties() to set the arch flags is not possible
|
||||
set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
|
||||
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
|
||||
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
||||
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
|
||||
endfunction()
|
||||
|
||||
function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (tag_name)
|
||||
set(GGML_CPU_NAME ggml-cpu-${tag_name})
|
||||
else()
|
||||
set(GGML_CPU_NAME ggml-cpu)
|
||||
endif()
|
||||
|
||||
ggml_add_backend_library(${GGML_CPU_NAME})
|
||||
|
||||
list (APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/ggml-cpu.c
|
||||
ggml-cpu/ggml-cpu.cpp
|
||||
ggml-cpu/repack.cpp
|
||||
ggml-cpu/repack.h
|
||||
ggml-cpu/hbm.cpp
|
||||
ggml-cpu/hbm.h
|
||||
ggml-cpu/quants.c
|
||||
ggml-cpu/quants.h
|
||||
ggml-cpu/traits.cpp
|
||||
ggml-cpu/traits.h
|
||||
ggml-cpu/amx/amx.cpp
|
||||
ggml-cpu/amx/amx.h
|
||||
ggml-cpu/amx/mmq.cpp
|
||||
ggml-cpu/amx/mmq.h
|
||||
ggml-cpu/ggml-cpu-impl.h
|
||||
ggml-cpu/common.h
|
||||
ggml-cpu/binary-ops.h
|
||||
ggml-cpu/binary-ops.cpp
|
||||
ggml-cpu/unary-ops.h
|
||||
ggml-cpu/unary-ops.cpp
|
||||
ggml-cpu/simd-mappings.h
|
||||
ggml-cpu/vec.h
|
||||
ggml-cpu/vec.cpp
|
||||
ggml-cpu/ops.h
|
||||
ggml-cpu/ops.cpp
|
||||
)
|
||||
|
||||
target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
|
||||
target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)
|
||||
|
||||
if (APPLE AND GGML_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
||||
if (ACCELERATE_FRAMEWORK)
|
||||
message(STATUS "Accelerate framework found")
|
||||
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64)
|
||||
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK})
|
||||
else()
|
||||
message(WARNING "Accelerate framework not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP)
|
||||
if (OpenMP_FOUND)
|
||||
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
|
||||
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
message(WARNING "OpenMP not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_LLAMAFILE)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
|
||||
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/llamafile/sgemm.cpp
|
||||
ggml-cpu/llamafile/sgemm.h)
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_HBM)
|
||||
find_library(memkind memkind REQUIRED)
|
||||
|
||||
message(STATUS "Using memkind for CPU HBM")
|
||||
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM)
|
||||
|
||||
target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
|
||||
endif()
|
||||
|
||||
if (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
||||
message(STATUS "ARM detected")
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/arch/arm/quants.c
|
||||
ggml-cpu/arch/arm/repack.cpp
|
||||
)
|
||||
|
||||
if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
||||
message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
|
||||
else()
|
||||
check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
||||
if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
||||
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
|
||||
endif()
|
||||
|
||||
if (GGML_NATIVE)
|
||||
# -mcpu=native does not always enable all the features in some compilers,
|
||||
# so we check for them manually and enable them if available
|
||||
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v -
|
||||
INPUT_FILE "/dev/null"
|
||||
OUTPUT_QUIET
|
||||
ERROR_VARIABLE ARM_MCPU
|
||||
RESULT_VARIABLE ARM_MCPU_RESULT
|
||||
)
|
||||
if (NOT ARM_MCPU_RESULT)
|
||||
string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
|
||||
string(REGEX MATCH "-march=[^ ']+" ARM_MARCH_FLAG "${ARM_MCPU}")
|
||||
|
||||
# on some old GCC we need to read -march=
|
||||
if (ARM_MARCH_FLAG AND NOT "${ARM_MARCH_FLAG}" STREQUAL "-march=native")
|
||||
set(ARM_NATIVE_FLAG "${ARM_MARCH_FLAG}")
|
||||
elseif(ARM_MCPU_FLAG AND NOT "${ARM_MCPU_FLAG}" STREQUAL "-mcpu=native")
|
||||
set(ARM_NATIVE_FLAG "${ARM_MCPU_FLAG}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if ("${ARM_NATIVE_FLAG}" STREQUAL "")
|
||||
set(ARM_NATIVE_FLAG -mcpu=native)
|
||||
message(WARNING "ARM -march/-mcpu not found, -mcpu=native will be used")
|
||||
else()
|
||||
message(STATUS "ARM detected flags: ${ARM_NATIVE_FLAG}")
|
||||
endif()
|
||||
|
||||
include(CheckCXXSourceRuns)
|
||||
|
||||
macro(check_arm_feature tag feature code)
|
||||
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+${tag}")
|
||||
check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
|
||||
if (GGML_MACHINE_SUPPORTS_${tag})
|
||||
set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+${tag}")
|
||||
else()
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+no${tag}")
|
||||
check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
|
||||
if (GGML_MACHINE_SUPPORTS_no${tag})
|
||||
set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+no${tag}")
|
||||
list(APPEND ARCH_FLAGS -U__ARM_FEATURE_${feature})
|
||||
endif()
|
||||
endif()
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
||||
endmacro()
|
||||
|
||||
check_arm_feature(dotprod DOTPROD "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
|
||||
check_arm_feature(i8mm MATMUL_INT8 "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
|
||||
check_arm_feature(sve SVE "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
|
||||
check_arm_feature(sme SME "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
|
||||
|
||||
list(APPEND ARCH_FLAGS "${ARM_NATIVE_FLAG}${ARM_NATIVE_FLAG_FIX}")
|
||||
else()
|
||||
if (GGML_CPU_ARM_ARCH)
|
||||
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
|
||||
elseif(GGML_CPU_ALL_VARIANTS)
|
||||
# Begin with the lowest baseline
|
||||
set(ARM_MCPU "armv8-a")
|
||||
set(ARCH_TAGS "")
|
||||
set(ARCH_DEFINITIONS "")
|
||||
|
||||
# When a feature is selected, bump the MCPU to the first
|
||||
# version that supported it
|
||||
if (GGML_INTERNAL_DOTPROD)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
|
||||
endif()
|
||||
if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+fp16")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SVE)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sve")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
|
||||
endif()
|
||||
if (GGML_INTERNAL_MATMUL_INT8)
|
||||
set(ARM_MCPU "armv8.6-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SVE2)
|
||||
set(ARM_MCPU "armv8.6-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sve2")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
|
||||
endif()
|
||||
if (GGML_INTERNAL_NOSVE)
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+nosve")
|
||||
endif()
|
||||
if (GGML_INTERNAL_SME)
|
||||
set(ARM_MCPU "armv9.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sme")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
|
||||
endif()
|
||||
list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
message(STATUS "Checking for ARM features using flags:")
|
||||
foreach(flag IN LISTS ARCH_FLAGS)
|
||||
message(STATUS " ${flag}")
|
||||
endforeach()
|
||||
|
||||
include(CheckCXXSourceCompiles)
|
||||
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
||||
string(REPLACE ";" " " ARCH_FLAGS_STR "${ARCH_FLAGS}")
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS_STR}")
|
||||
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
|
||||
set(ARM_FEATURE "HAVE_${feature}")
|
||||
check_cxx_source_compiles(
|
||||
"
|
||||
#if !defined(__ARM_FEATURE_${feature})
|
||||
# error \"Feature ${feature} is not defined\"
|
||||
#endif
|
||||
int main() { return 0; }
|
||||
"
|
||||
${ARM_FEATURE}
|
||||
)
|
||||
endforeach()
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
|
||||
message(STATUS "x86 detected")
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/arch/x86/quants.c
|
||||
ggml-cpu/arch/x86/repack.cpp
|
||||
)
|
||||
|
||||
if (MSVC)
|
||||
# instruction set detection for MSVC only
|
||||
if (GGML_NATIVE)
|
||||
include(ggml-cpu/cmake/FindSIMD.cmake)
|
||||
endif ()
|
||||
if (GGML_AVX512)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX512)
|
||||
# /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
|
||||
# MSVC has no compile-time flags enabling specific
|
||||
# AVX512 extensions, neither it defines the
|
||||
# macros corresponding to the extensions.
|
||||
# Do it manually.
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX512)
|
||||
if (GGML_AVX512_VBMI)
|
||||
list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
||||
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
||||
endif()
|
||||
endif()
|
||||
if (GGML_AVX512_VNNI)
|
||||
list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI)
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
||||
list(APPEND ARCH_FLAGS -mavx512vnni)
|
||||
endif()
|
||||
endif()
|
||||
if (GGML_AVX512_BF16)
|
||||
list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16)
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
||||
list(APPEND ARCH_FLAGS -mavx512bf16)
|
||||
endif()
|
||||
endif()
|
||||
if (GGML_AMX_TILE)
|
||||
list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE)
|
||||
endif()
|
||||
if (GGML_AMX_INT8)
|
||||
list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8)
|
||||
endif()
|
||||
if (GGML_AMX_BF16)
|
||||
list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16)
|
||||
endif()
|
||||
elseif (GGML_AVX2)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C)
|
||||
elseif (GGML_AVX)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX)
|
||||
elseif (GGML_SSE42)
|
||||
list(APPEND ARCH_FLAGS /arch:SSE4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
endif()
|
||||
if (GGML_AVX_VNNI)
|
||||
list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
|
||||
endif()
|
||||
if (GGML_BMI2)
|
||||
# MSVC does not define macro __BMI2__
|
||||
list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
|
||||
endif()
|
||||
else ()
|
||||
if (GGML_NATIVE)
|
||||
list(APPEND ARCH_FLAGS -march=native)
|
||||
else ()
|
||||
if (GGML_SSE42)
|
||||
list(APPEND ARCH_FLAGS -msse4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
endif()
|
||||
if (GGML_F16C)
|
||||
list(APPEND ARCH_FLAGS -mf16c)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_F16C)
|
||||
endif()
|
||||
if (GGML_FMA)
|
||||
list(APPEND ARCH_FLAGS -mfma)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_FMA)
|
||||
endif()
|
||||
if (GGML_BMI2)
|
||||
list(APPEND ARCH_FLAGS -mbmi2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_BMI2)
|
||||
endif()
|
||||
if (GGML_AVX)
|
||||
list(APPEND ARCH_FLAGS -mavx)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX)
|
||||
endif()
|
||||
if (GGML_AVX2)
|
||||
list(APPEND ARCH_FLAGS -mavx2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX2)
|
||||
endif()
|
||||
if (GGML_AVX_VNNI)
|
||||
list(APPEND ARCH_FLAGS -mavxvnni)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI)
|
||||
endif()
|
||||
if (GGML_AVX512)
|
||||
list(APPEND ARCH_FLAGS -mavx512f)
|
||||
list(APPEND ARCH_FLAGS -mavx512cd)
|
||||
list(APPEND ARCH_FLAGS -mavx512vl)
|
||||
list(APPEND ARCH_FLAGS -mavx512dq)
|
||||
list(APPEND ARCH_FLAGS -mavx512bw)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX512)
|
||||
endif()
|
||||
if (GGML_AVX512_VBMI)
|
||||
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI)
|
||||
endif()
|
||||
if (GGML_AVX512_VNNI)
|
||||
list(APPEND ARCH_FLAGS -mavx512vnni)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI)
|
||||
endif()
|
||||
if (GGML_AVX512_BF16)
|
||||
list(APPEND ARCH_FLAGS -mavx512bf16)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16)
|
||||
endif()
|
||||
if (GGML_AMX_TILE)
|
||||
list(APPEND ARCH_FLAGS -mamx-tile)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE)
|
||||
endif()
|
||||
if (GGML_AMX_INT8)
|
||||
list(APPEND ARCH_FLAGS -mamx-int8)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8)
|
||||
endif()
|
||||
if (GGML_AMX_BF16)
|
||||
list(APPEND ARCH_FLAGS -mamx-bf16)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_BACKEND_DL)
|
||||
if (GGML_NATIVE)
|
||||
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
|
||||
message(STATUS "PowerPC detected")
|
||||
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c)
|
||||
if (GGML_NATIVE)
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
file(READ "/proc/cpuinfo" POWER10_M)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
|
||||
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
||||
endif()
|
||||
|
||||
string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
|
||||
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
|
||||
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
|
||||
|
||||
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10)
|
||||
elseif (EXTRACTED_NUMBER EQUAL 9)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power9)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
|
||||
endif()
|
||||
elseif(GGML_CPU_ALL_VARIANTS)
|
||||
# Begin with the lowest baseline
|
||||
set(ARCH_DEFINITIONS "")
|
||||
|
||||
# When a feature is selected, bump the MCPU to the first
|
||||
# version that supported it
|
||||
foreach(PVER RANGE 7 11)
|
||||
if(DEFINED GGML_INTERNAL_POWER${PVER})
|
||||
set(POWERPC_MCPU "power${PVER}")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER})
|
||||
endif()
|
||||
endforeach()
|
||||
if (GGML_INTERNAL_VSX)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_VSX)
|
||||
list(APPEND ARCH_FLAGS -mvsx)
|
||||
endif()
|
||||
|
||||
if (DEFINED POWERPC_MCPU)
|
||||
list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU})
|
||||
endif()
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS})
|
||||
else()
|
||||
if (GGML_CPU_POWERPC_CPUTYPE)
|
||||
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
|
||||
endif()
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
|
||||
message(STATUS "loongarch64 detected")
|
||||
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c)
|
||||
|
||||
list(APPEND ARCH_FLAGS -march=loongarch64)
|
||||
if (GGML_LASX)
|
||||
list(APPEND ARCH_FLAGS -mlasx)
|
||||
endif()
|
||||
if (GGML_LSX)
|
||||
list(APPEND ARCH_FLAGS -mlsx)
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
|
||||
message(STATUS "riscv64 detected")
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/arch/riscv/quants.c
|
||||
ggml-cpu/arch/riscv/repack.cpp
|
||||
)
|
||||
if (GGML_CPU_RISCV64_SPACEMIT)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/spacemit/ime.cpp
|
||||
ggml-cpu/spacemit/ime.h
|
||||
ggml-cpu/spacemit/ime1_kernels.cpp
|
||||
ggml-cpu/spacemit/ime_kernels.h
|
||||
)
|
||||
endif()
|
||||
if(NOT GGML_CPU_ALL_VARIANTS)
|
||||
set(MARCH_STR "rv64gc")
|
||||
if (GGML_RV_ZFH)
|
||||
string(APPEND MARCH_STR "_zfh")
|
||||
endif()
|
||||
|
||||
if (GGML_XTHEADVECTOR)
|
||||
string(APPEND MARCH_STR "_xtheadvector")
|
||||
elseif (GGML_RVV)
|
||||
string(APPEND MARCH_STR "_v")
|
||||
if (GGML_RV_ZVFH)
|
||||
string(APPEND MARCH_STR "_zvfh")
|
||||
endif()
|
||||
if (GGML_RV_ZVFBFWMA)
|
||||
string(APPEND MARCH_STR "_zvfbfwma")
|
||||
endif()
|
||||
endif()
|
||||
if (GGML_RV_ZICBOP)
|
||||
string(APPEND MARCH_STR "_zicbop")
|
||||
endif()
|
||||
if (GGML_RV_ZIHINTPAUSE)
|
||||
string(APPEND MARCH_STR "_zihintpause")
|
||||
endif()
|
||||
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
|
||||
else()
|
||||
# Begin with the lowest baseline
|
||||
set(ARCH_DEFINITIONS "")
|
||||
|
||||
if (GGML_INTERNAL_RVV)
|
||||
message(STATUS "RVV enabled")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_RVV)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gc_v -mabi=lp64d)
|
||||
endif()
|
||||
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} riscv ${ARCH_DEFINITIONS})
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
|
||||
message(STATUS "s390x detected")
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/arch/s390/quants.c)
|
||||
|
||||
# for native compilation
|
||||
if (GGML_NATIVE)
|
||||
# check machine level to determine target
|
||||
file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
|
||||
string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
|
||||
|
||||
# TODO: Separation to determine activation of VX/VXE/VXE2
|
||||
if (${S390X_M} MATCHES "8561|8562")
|
||||
message(STATUS "z15 target")
|
||||
list(APPEND ARCH_FLAGS -march=z15)
|
||||
elseif (${S390X_M} MATCHES "3931")
|
||||
message(STATUS "z16 target")
|
||||
list(APPEND ARCH_FLAGS -march=z16)
|
||||
elseif (${S390X_M} MATCHES "9175|9176")
|
||||
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
|
||||
# binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
|
||||
message(STATUS "z17 target")
|
||||
list(APPEND ARCH_FLAGS -march=arch15)
|
||||
else()
|
||||
message(STATUS "Unknown target")
|
||||
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
|
||||
list(APPEND ARCH_FLAGS -march=native -mtune=native)
|
||||
endif()
|
||||
# for cross-compilation
|
||||
elseif(GGML_CPU_ALL_VARIANTS)
|
||||
# range through IBM z15 to z17
|
||||
# NOTE: update when a new hardware level is released
|
||||
foreach (ZHW RANGE 15 17)
|
||||
if(DEFINED GGML_INTERNAL_Z${ZHW})
|
||||
message(STATUS "z${ZHW} cross-compile target")
|
||||
list(APPEND ARCH_FLAGS -march=z${ZHW})
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
if (GGML_VXE OR GGML_INTERNAL_VXE2)
|
||||
message(STATUS "VXE2 enabled")
|
||||
list(APPEND ARCH_FLAGS -mvx -mzvector)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_VXE2)
|
||||
endif()
|
||||
|
||||
if (GGML_INTERNAL_NNPA)
|
||||
message(STATUS "NNPA enabled")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_NNPA)
|
||||
endif()
|
||||
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
|
||||
message(STATUS "Wasm detected")
|
||||
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
|
||||
else()
|
||||
message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
|
||||
list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_REPACK)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK)
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_KLEIDIAI)
|
||||
message(STATUS "Using KleidiAI optimized kernels if applicable")
|
||||
|
||||
# Disable the KleidiAI tests
|
||||
set(KLEIDIAI_BUILD_TESTS OFF)
|
||||
|
||||
# Fetch KleidiAI sources:
|
||||
include(FetchContent)
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.16.0")
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
endif()
|
||||
|
||||
FetchContent_Declare(KleidiAI_Download
|
||||
URL ${KLEIDIAI_DOWNLOAD_URL}
|
||||
DOWNLOAD_EXTRACT_TIMESTAMP NEW
|
||||
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
|
||||
|
||||
FetchContent_MakeAvailable(KleidiAI_Download)
|
||||
FetchContent_GetProperties(KleidiAI_Download
|
||||
SOURCE_DIR KLEIDIAI_SRC
|
||||
POPULATED KLEIDIAI_POPULATED)
|
||||
|
||||
if (NOT KLEIDIAI_POPULATED)
|
||||
message(FATAL_ERROR "KleidiAI source downloaded failed.")
|
||||
endif()
|
||||
|
||||
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
|
||||
|
||||
# Remove kleidiai target after fetching it
|
||||
if (TARGET kleidiai)
|
||||
set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
|
||||
endif()
|
||||
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/kleidiai/kleidiai.cpp
|
||||
ggml-cpu/kleidiai/kernels.cpp
|
||||
ggml-cpu/kleidiai/kleidiai.h
|
||||
ggml-cpu/kleidiai/kernels.h
|
||||
)
|
||||
|
||||
# KleidiAI
|
||||
include_directories(
|
||||
${KLEIDIAI_SRC}/
|
||||
${KLEIDIAI_SRC}/kai/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
|
||||
|
||||
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
|
||||
if (NOT ARCH_FLAGS_TEMP)
|
||||
string(REGEX MATCH "-march=[^ ]+" ARCH_FLAGS_TEMP "${CMAKE_C_FLAGS}")
|
||||
endif()
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+sve" SVE_ENABLED)
|
||||
|
||||
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
|
||||
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.c)
|
||||
|
||||
if (NOT DOTPROD_ENABLED MATCHES -1)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c)
|
||||
endif()
|
||||
|
||||
if (NOT I8MM_ENABLED MATCHES -1)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.c)
|
||||
endif()
|
||||
|
||||
if (NOT SME_ENABLED MATCHES -1)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_asm.S
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
|
||||
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
|
||||
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
|
||||
endif()
|
||||
|
||||
if (NOT SVE_ENABLED MATCHES -1)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/kai_common_sve_asm.S
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod_asm.S
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm_asm.S
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.c)
|
||||
endif()
|
||||
|
||||
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
|
||||
list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
|
||||
endif()
|
||||
|
||||
message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
|
||||
target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
|
||||
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
|
||||
endif()
|
||||
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
|
||||
# The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
|
||||
target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
|
||||
endif()
|
||||
endfunction()
|
||||
@@ -0,0 +1,224 @@
|
||||
#include "amx.h"
|
||||
#include "common.h"
|
||||
#include "mmq.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "traits.h"
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <sys/syscall.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
|
||||
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
|
||||
|
||||
// AMX type_trais
|
||||
namespace ggml::cpu::amx {
|
||||
class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
|
||||
size = ggml_backend_amx_desired_wsize(op);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
|
||||
if (op->op == GGML_OP_MUL_MAT) {
|
||||
ggml_backend_amx_mul_mat(params, op);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
|
||||
static tensor_traits traits;
|
||||
return &traits;
|
||||
}
|
||||
} // namespace ggml::cpu::amx
|
||||
|
||||
// AMX buffer interface
|
||||
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
free(buffer->context);
|
||||
}
|
||||
|
||||
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return (void *) (buffer->context);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
|
||||
uint8_t value, size_t offset, size_t size) {
|
||||
memset((char *) tensor->data + offset, value, size);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
|
||||
const void * data, size_t offset, size_t size) {
|
||||
if (qtype_has_amx_kernels(tensor->type)) {
|
||||
GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
|
||||
ggml_backend_amx_convert_weight(tensor, data, offset, size);
|
||||
} else {
|
||||
memcpy((char *) tensor->data + offset, data, size);
|
||||
}
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
/*
|
||||
// need to figure what we need to do with buffer->extra.
|
||||
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
|
||||
memcpy(data, (const char *)tensor->data + offset, size);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
if (ggml_backend_buffer_is_host(src->buffer)) {
|
||||
if (qtype_has_amx_kernels(src->type)) {
|
||||
ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
|
||||
} else {
|
||||
memcpy(dst->data, src->data, ggml_nbytes(src));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
*/
|
||||
|
||||
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
memset(buffer->context, value, buffer->size);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
|
||||
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_amx_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_amx_buffer_init_tensor,
|
||||
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
|
||||
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
|
||||
/* .get_tensor = */ nullptr,
|
||||
/* .cpy_tensor = */ nullptr,
|
||||
/* .clear = */ ggml_backend_amx_buffer_clear,
|
||||
/* .reset = */ nullptr,
|
||||
};
|
||||
|
||||
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "AMX";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * data = ggml_aligned_malloc(size);
|
||||
if (data == NULL) {
|
||||
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return TENSOR_ALIGNMENT;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
namespace ggml::cpu::amx {
|
||||
class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
|
||||
// handle only 2d gemm for now
|
||||
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
|
||||
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
|
||||
};
|
||||
|
||||
if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
|
||||
is_contiguous_2d(op->src[1]) && // src1 must be contiguous
|
||||
op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
|
||||
op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
|
||||
op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
|
||||
(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
|
||||
// src1 must be host buffer
|
||||
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
|
||||
return false;
|
||||
}
|
||||
// src1 must be float32
|
||||
if (op->src[1]->type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
|
||||
if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
|
||||
op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
|
||||
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
} // namespace ggml::cpu::amx
|
||||
|
||||
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
return ggml_backend_amx_get_alloc_size(tensor);
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
#define ARCH_GET_XCOMP_PERM 0x1022
|
||||
#define ARCH_REQ_XCOMP_PERM 0x1023
|
||||
#define XFEATURE_XTILECFG 17
|
||||
#define XFEATURE_XTILEDATA 18
|
||||
|
||||
static bool ggml_amx_init() {
|
||||
#if defined(__linux__)
|
||||
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
|
||||
fprintf(stderr, "AMX is not ready to be used!\n");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
#elif defined(_WIN32)
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
|
||||
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ nullptr,
|
||||
},
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ new ggml::cpu::amx::extra_buffer_type(),
|
||||
};
|
||||
|
||||
if (!ggml_amx_init()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return &ggml_backend_buffer_type_amx;
|
||||
}
|
||||
|
||||
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
|
||||
@@ -0,0 +1,8 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
|
||||
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
||||
#endif
|
||||
@@ -0,0 +1,91 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
#if defined(GGML_USE_OPENMP)
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#define TILE_M 16
|
||||
#define TILE_N 16
|
||||
#define TILE_K 32
|
||||
#define VNNI_BLK 4
|
||||
|
||||
#define AMX_BLK_SIZE 32
|
||||
|
||||
#define TMM0 0
|
||||
#define TMM1 1
|
||||
#define TMM2 2
|
||||
#define TMM3 3
|
||||
#define TMM4 4
|
||||
#define TMM5 5
|
||||
#define TMM6 6
|
||||
#define TMM7 7
|
||||
|
||||
// parallel routines
|
||||
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
|
||||
inline T div_up(T x, T y) { return (x + y - 1) / y; }
|
||||
|
||||
template <typename T>
|
||||
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
|
||||
#if 0
|
||||
// onednn partition pattern
|
||||
T& n_my = n_end;
|
||||
if (nth <= 1 || n == 0) {
|
||||
n_start = 0;
|
||||
n_my = n;
|
||||
} else {
|
||||
T n1 = div_up(n, nth);
|
||||
T n2 = n1 - 1;
|
||||
T T1 = n - n2 * nth;
|
||||
n_my = ith < T1 ? n1 : n2;
|
||||
n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
|
||||
}
|
||||
n_end += n_start;
|
||||
#else
|
||||
// pytorch aten partition pattern
|
||||
T n_my = div_up(n, nth);
|
||||
n_start = ith * n_my;
|
||||
n_end = std::min(n_start + n_my, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename func_t>
|
||||
inline void parallel_for(int n, const func_t& f) {
|
||||
#if defined(GGML_USE_OPENMP)
|
||||
#pragma omp parallel
|
||||
{
|
||||
int nth = omp_get_num_threads();
|
||||
int ith = omp_get_thread_num();
|
||||
int tbegin, tend;
|
||||
balance211(n, nth, ith, tbegin, tend);
|
||||
f(tbegin, tend);
|
||||
}
|
||||
#else
|
||||
f(0, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename func_t>
|
||||
inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
|
||||
int tbegin, tend;
|
||||
balance211(n, params->nth, params->ith, tbegin, tend);
|
||||
f(tbegin, tend);
|
||||
}
|
||||
|
||||
// quantized types that have AMX support
|
||||
inline bool qtype_has_amx_kernels(const enum ggml_type type) {
|
||||
// TODO: fix padding for vnni format
|
||||
return (type == GGML_TYPE_Q4_0) ||
|
||||
(type == GGML_TYPE_Q4_1) ||
|
||||
(type == GGML_TYPE_Q8_0) ||
|
||||
(type == GGML_TYPE_Q4_K) ||
|
||||
(type == GGML_TYPE_Q5_K) ||
|
||||
(type == GGML_TYPE_Q6_K) ||
|
||||
(type == GGML_TYPE_IQ4_XS);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,10 @@
|
||||
#pragma once
|
||||
#include "common.h"
|
||||
|
||||
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
|
||||
|
||||
size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
|
||||
|
||||
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
|
||||
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
@@ -0,0 +1,262 @@
|
||||
#pragma once
|
||||
|
||||
// Rename `_generic` functions if no native implementation is available.
|
||||
// This effectively selects the generic implementation.
|
||||
|
||||
#if defined(GGML_CPU_GENERIC)
|
||||
// quants.c
|
||||
#define quantize_row_q8_0_generic quantize_row_q8_0
|
||||
#define quantize_row_q8_1_generic quantize_row_q8_1
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
|
||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
|
||||
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
||||
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||
#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
|
||||
#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
|
||||
#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
|
||||
#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__POWERPC__) || defined(__powerpc__)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__loongarch64)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__riscv)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__s390x__)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__wasm__)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#endif
|
||||
@@ -0,0 +1,98 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#if defined(__aarch64__)
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <sys/auxv.h>
|
||||
#elif defined(__APPLE__)
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
||||
#if !defined(HWCAP2_SVE2)
|
||||
#define HWCAP2_SVE2 (1 << 1)
|
||||
#endif
|
||||
|
||||
#if !defined(HWCAP2_I8MM)
|
||||
#define HWCAP2_I8MM (1 << 13)
|
||||
#endif
|
||||
|
||||
#if !defined(HWCAP2_SME)
|
||||
#define HWCAP2_SME (1 << 23)
|
||||
#endif
|
||||
|
||||
struct aarch64_features {
|
||||
// has_neon not needed, aarch64 has NEON guaranteed
|
||||
bool has_dotprod = false;
|
||||
bool has_fp16_va = false;
|
||||
bool has_sve = false;
|
||||
bool has_sve2 = false;
|
||||
bool has_i8mm = false;
|
||||
bool has_sme = false;
|
||||
|
||||
aarch64_features() {
|
||||
#if defined(__linux__)
|
||||
uint32_t hwcap = getauxval(AT_HWCAP);
|
||||
uint32_t hwcap2 = getauxval(AT_HWCAP2);
|
||||
|
||||
has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
|
||||
has_fp16_va = !!(hwcap & HWCAP_FPHP);
|
||||
has_sve = !!(hwcap & HWCAP_SVE);
|
||||
has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
|
||||
has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
|
||||
has_sme = !!(hwcap2 & HWCAP2_SME);
|
||||
#elif defined(__APPLE__)
|
||||
int oldp = 0;
|
||||
size_t size = sizeof(oldp);
|
||||
|
||||
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
|
||||
has_dotprod = static_cast<bool>(oldp);
|
||||
}
|
||||
|
||||
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
|
||||
has_i8mm = static_cast<bool>(oldp);
|
||||
}
|
||||
|
||||
if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
|
||||
has_sme = static_cast<bool>(oldp);
|
||||
}
|
||||
|
||||
// Apple apparently does not implement SVE yet
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
static int ggml_backend_cpu_aarch64_score() {
|
||||
int score = 1;
|
||||
aarch64_features af;
|
||||
|
||||
#ifdef GGML_USE_DOTPROD
|
||||
if (!af.has_dotprod) { return 0; }
|
||||
score += 1<<1;
|
||||
#endif
|
||||
#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
|
||||
if (!af.has_fp16_va) { return 0; }
|
||||
score += 1<<2;
|
||||
#endif
|
||||
#ifdef GGML_USE_SVE
|
||||
if (!af.has_sve) { return 0; }
|
||||
score += 1<<3;
|
||||
#endif
|
||||
#ifdef GGML_USE_MATMUL_INT8
|
||||
if (!af.has_i8mm) { return 0; }
|
||||
score += 1<<4;
|
||||
#endif
|
||||
#ifdef GGML_USE_SVE2
|
||||
if (!af.has_sve2) { return 0; }
|
||||
score += 1<<5;
|
||||
#endif
|
||||
#ifdef GGML_USE_SME
|
||||
if (!af.has_sme) { return 0; }
|
||||
score += 1<<6;
|
||||
#endif
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)
|
||||
|
||||
# endif // defined(__aarch64__)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,82 @@
|
||||
# include "ggml-backend-impl.h"
|
||||
|
||||
#if defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
|
||||
struct powerpc_features {
|
||||
std::string platform = "";
|
||||
int power_version = -1;
|
||||
|
||||
bool has_vsx = false;
|
||||
|
||||
powerpc_features() {
|
||||
#if defined(__linux__)
|
||||
unsigned long auxval = getauxval(AT_PLATFORM);
|
||||
if (auxval) {
|
||||
platform = std::string(reinterpret_cast<const char*>(auxval));
|
||||
// TBD: Do systems exist that return this in uppercase?
|
||||
if (platform.substr(0, 5) == "power") {
|
||||
// Extractt a numeric suffix, if one exists
|
||||
int vpos = -1;
|
||||
for (int i = platform.length() - 1; i >= 0; i--) {
|
||||
if (std::isdigit(platform[i])) {
|
||||
vpos = i;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (vpos > -1) {
|
||||
power_version = std::stoi(platform.substr(vpos));
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (power_version >= 9) {
|
||||
has_vsx = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static int ggml_backend_cpu_powerpc_score() {
|
||||
int score = 1;
|
||||
powerpc_features pf;
|
||||
|
||||
// Platform scores
|
||||
#if defined(GGML_USE_POWER7)
|
||||
if (pf.power_version < 7) { return 0; }
|
||||
score += 1<<1;
|
||||
#endif
|
||||
#if defined(GGML_USE_POWER8)
|
||||
if (pf.power_version < 8) { return 0; }
|
||||
score += 1<<2;
|
||||
#endif
|
||||
#if defined(GGML_USE_POWER9)
|
||||
if (pf.power_version < 9) { return 0; }
|
||||
score += 1<<3;
|
||||
#endif
|
||||
#if defined(GGML_USE_POWER10)
|
||||
if (pf.power_version < 10) { return 0; }
|
||||
score += 1<<4;
|
||||
#endif
|
||||
#if defined(GGML_USE_POWER11)
|
||||
if (pf.power_version < 11) { return 0; }
|
||||
score += 1<<5;
|
||||
#endif
|
||||
|
||||
// Feature scores
|
||||
#if defined(GGML_USE_VSX)
|
||||
if (!pf.has_vsx) { return 0; }
|
||||
score += 1<<6;
|
||||
#endif
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_powerpc_score)
|
||||
|
||||
#endif // defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,38 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#if defined(__riscv) && __riscv_xlen == 64
|
||||
#include <asm/hwprobe.h>
|
||||
#include <asm/unistd.h>
|
||||
#include <unistd.h>
|
||||
|
||||
struct riscv64_features {
|
||||
bool has_rvv = false;
|
||||
|
||||
riscv64_features() {
|
||||
struct riscv_hwprobe probe;
|
||||
probe.key = RISCV_HWPROBE_KEY_IMA_EXT_0;
|
||||
probe.value = 0;
|
||||
|
||||
int ret = syscall(__NR_riscv_hwprobe, &probe, 1, 0, NULL, 0);
|
||||
|
||||
if (0 == ret) {
|
||||
has_rvv = !!(probe.value & RISCV_HWPROBE_IMA_V);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static int ggml_backend_cpu_riscv64_score() {
|
||||
int score = 1;
|
||||
riscv64_features rf;
|
||||
|
||||
#ifdef GGML_USE_RVV
|
||||
if (!rf.has_rvv) { return 0; }
|
||||
score += 1 << 1;
|
||||
#endif
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_riscv64_score)
|
||||
|
||||
#endif // __riscv && __riscv_xlen == 64
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,342 @@
|
||||
#define GGML_COMMON_IMPL_CPP
|
||||
#define GGML_COMMON_DECL_CPP
|
||||
#include "ggml-common.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "simd-mappings.h"
|
||||
#include "traits.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <cstdlib> // for qsort
|
||||
#include <cstdio> // for GGML_ASSERT
|
||||
|
||||
#define GGML_CPU_CLANG_WORKAROUND
|
||||
#include "../../repack.h"
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 8;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(s);
|
||||
UNUSED(bs);
|
||||
UNUSED(vx);
|
||||
UNUSED(vy);
|
||||
UNUSED(nr);
|
||||
UNUSED(nc);
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v
|
||||
if (__riscv_vlenb() >= QK4_0) {
|
||||
const size_t vl = QK4_0;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
|
||||
vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
|
||||
for (int l = 0; l < nb; l++) {
|
||||
const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
|
||||
const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
|
||||
const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
|
||||
const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
|
||||
__asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment constraints
|
||||
const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
|
||||
const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
|
||||
const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
|
||||
const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
|
||||
|
||||
const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
|
||||
const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
|
||||
const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
|
||||
const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
|
||||
const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
|
||||
const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
|
||||
const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
|
||||
|
||||
const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
|
||||
const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
|
||||
|
||||
const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m));
|
||||
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
|
||||
const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
|
||||
const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
|
||||
const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
|
||||
const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
|
||||
const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
|
||||
const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
|
||||
const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
|
||||
const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
|
||||
const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
|
||||
const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
|
||||
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
||||
|
||||
// vector version needs Zvfhmin extension
|
||||
const float a_scale = GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
const float b_scales[8] = {
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
|
||||
};
|
||||
const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
|
||||
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
|
||||
sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4);
|
||||
}
|
||||
__riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 8;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nr % 4 == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(s);
|
||||
UNUSED(bs);
|
||||
UNUSED(vx);
|
||||
UNUSED(vy);
|
||||
UNUSED(nr);
|
||||
UNUSED(nc);
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v
|
||||
if (__riscv_vlenb() >= QK4_0) {
|
||||
const size_t vl = QK4_0;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
|
||||
vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
|
||||
vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
|
||||
vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
|
||||
for (int l = 0; l < nb; l++) {
|
||||
const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
|
||||
const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
|
||||
const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
|
||||
const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
|
||||
const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
|
||||
const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
|
||||
const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
|
||||
|
||||
// vector version needs Zvfhmin extension
|
||||
const float a_scales[4] = {
|
||||
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[0]),
|
||||
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[1]),
|
||||
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[2]),
|
||||
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[3])
|
||||
};
|
||||
const float b_scales[8] = {
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
|
||||
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
|
||||
};
|
||||
const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
|
||||
|
||||
const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
|
||||
const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
|
||||
const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
|
||||
const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
|
||||
__asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
|
||||
vint16m4_t sumi_l0;
|
||||
{
|
||||
const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
|
||||
const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
|
||||
const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
|
||||
const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));
|
||||
const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
|
||||
const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
|
||||
|
||||
sumi_l0 = sumi_hi_m;
|
||||
}
|
||||
|
||||
{
|
||||
const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0));
|
||||
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
|
||||
const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
|
||||
const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
|
||||
const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
|
||||
const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
|
||||
const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
|
||||
const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
|
||||
const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
|
||||
const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
|
||||
const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
|
||||
const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
|
||||
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
||||
|
||||
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
|
||||
sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
|
||||
}
|
||||
|
||||
const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
|
||||
const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
|
||||
const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
|
||||
const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
|
||||
__asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
|
||||
vint16m4_t sumi_l1;
|
||||
{
|
||||
const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
|
||||
const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
|
||||
const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
|
||||
const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));
|
||||
const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
|
||||
const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
|
||||
|
||||
sumi_l1 = sumi_hi_m;
|
||||
}
|
||||
|
||||
{
|
||||
const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1));
|
||||
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
|
||||
const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
|
||||
const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
|
||||
const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
|
||||
const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
|
||||
const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
|
||||
const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
|
||||
const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
|
||||
const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
|
||||
const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
|
||||
const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
|
||||
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
||||
|
||||
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
|
||||
sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
|
||||
}
|
||||
|
||||
const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
|
||||
const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
|
||||
const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
|
||||
const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
|
||||
__asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
|
||||
vint16m4_t sumi_l2;
|
||||
{
|
||||
const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
|
||||
const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
|
||||
const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
|
||||
const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));
|
||||
const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
|
||||
const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
|
||||
|
||||
sumi_l2 = sumi_hi_m;
|
||||
}
|
||||
|
||||
{
|
||||
const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2));
|
||||
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
|
||||
const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
|
||||
const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
|
||||
const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
|
||||
const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
|
||||
const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
|
||||
const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
|
||||
const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
|
||||
const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
|
||||
const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
|
||||
const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
|
||||
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
||||
|
||||
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
|
||||
sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
|
||||
}
|
||||
|
||||
const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
|
||||
const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
|
||||
const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
|
||||
const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
|
||||
__asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
|
||||
vint16m4_t sumi_l3;
|
||||
{
|
||||
const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
|
||||
const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
|
||||
const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
|
||||
const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));
|
||||
const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
|
||||
const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
|
||||
|
||||
sumi_l3 = sumi_hi_m;
|
||||
}
|
||||
|
||||
{
|
||||
const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
|
||||
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
|
||||
const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
|
||||
const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
|
||||
const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
|
||||
const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
|
||||
const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
|
||||
const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
|
||||
const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
|
||||
const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
|
||||
const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
|
||||
const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
|
||||
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
||||
|
||||
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
|
||||
sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
|
||||
}
|
||||
}
|
||||
__riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
|
||||
__riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
|
||||
__riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
|
||||
__riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#if defined(__s390x__)
|
||||
#include <sys/auxv.h>
|
||||
|
||||
// find hwcap bits in asm/elf.h
|
||||
#ifndef HWCAP_VXRS_EXT2
|
||||
#define HWCAP_VXRS_EXT2 (1 << 15)
|
||||
#endif
|
||||
|
||||
#ifndef HWCAP_NNPA
|
||||
#define HWCAP_NNPA (1 << 20)
|
||||
#endif
|
||||
|
||||
struct s390x_features {
|
||||
bool has_vxe2 = false;
|
||||
bool has_nnpa = false;
|
||||
|
||||
s390x_features() {
|
||||
uint32_t hwcap = getauxval(AT_HWCAP);
|
||||
// NOTE: use hwcap2 with DFLT for z17 and later
|
||||
// uint32_t hwcap2 = getauxval(AT_HWCAP2);
|
||||
|
||||
has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
|
||||
has_nnpa = !!(hwcap & HWCAP_NNPA);
|
||||
}
|
||||
};
|
||||
|
||||
static int ggml_backend_cpu_s390x_score() {
|
||||
int score = 1;
|
||||
s390x_features sf;
|
||||
|
||||
// IBM z15 / LinuxONE 3
|
||||
#ifdef GGML_USE_VXE2
|
||||
if (!sf.has_vxe2) { return 0; }
|
||||
score += 1 << 1;
|
||||
#endif
|
||||
|
||||
// IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
|
||||
#ifdef GGML_USE_NNPA
|
||||
if (!sf.has_nnpa) { return 0; }
|
||||
score += 1 << 2;
|
||||
#endif
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
|
||||
|
||||
#endif // __s390x__
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,327 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <bitset>
|
||||
#include <array>
|
||||
#include <string>
|
||||
|
||||
// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
|
||||
struct cpuid_x86 {
|
||||
bool SSE3(void) { return f_1_ecx[0]; }
|
||||
bool PCLMULQDQ(void) { return f_1_ecx[1]; }
|
||||
bool MONITOR(void) { return f_1_ecx[3]; }
|
||||
bool SSSE3(void) { return f_1_ecx[9]; }
|
||||
bool FMA(void) { return f_1_ecx[12]; }
|
||||
bool CMPXCHG16B(void) { return f_1_ecx[13]; }
|
||||
bool SSE41(void) { return f_1_ecx[19]; }
|
||||
bool SSE42(void) { return f_1_ecx[20]; }
|
||||
bool MOVBE(void) { return f_1_ecx[22]; }
|
||||
bool POPCNT(void) { return f_1_ecx[23]; }
|
||||
bool AES(void) { return f_1_ecx[25]; }
|
||||
bool XSAVE(void) { return f_1_ecx[26]; }
|
||||
bool OSXSAVE(void) { return f_1_ecx[27]; }
|
||||
bool AVX(void) { return f_1_ecx[28]; }
|
||||
bool F16C(void) { return f_1_ecx[29]; }
|
||||
bool RDRAND(void) { return f_1_ecx[30]; }
|
||||
|
||||
bool MSR(void) { return f_1_edx[5]; }
|
||||
bool CX8(void) { return f_1_edx[8]; }
|
||||
bool SEP(void) { return f_1_edx[11]; }
|
||||
bool CMOV(void) { return f_1_edx[15]; }
|
||||
bool CLFSH(void) { return f_1_edx[19]; }
|
||||
bool MMX(void) { return f_1_edx[23]; }
|
||||
bool FXSR(void) { return f_1_edx[24]; }
|
||||
bool SSE(void) { return f_1_edx[25]; }
|
||||
bool SSE2(void) { return f_1_edx[26]; }
|
||||
|
||||
bool FSGSBASE(void) { return f_7_ebx[0]; }
|
||||
bool BMI1(void) { return f_7_ebx[3]; }
|
||||
bool HLE(void) { return is_intel && f_7_ebx[4]; }
|
||||
bool AVX2(void) { return f_7_ebx[5]; }
|
||||
bool BMI2(void) { return f_7_ebx[8]; }
|
||||
bool ERMS(void) { return f_7_ebx[9]; }
|
||||
bool INVPCID(void) { return f_7_ebx[10]; }
|
||||
bool RTM(void) { return is_intel && f_7_ebx[11]; }
|
||||
bool AVX512F(void) { return f_7_ebx[16]; }
|
||||
bool AVX512DQ(void) { return f_7_ebx[17]; }
|
||||
bool RDSEED(void) { return f_7_ebx[18]; }
|
||||
bool ADX(void) { return f_7_ebx[19]; }
|
||||
bool AVX512PF(void) { return f_7_ebx[26]; }
|
||||
bool AVX512ER(void) { return f_7_ebx[27]; }
|
||||
bool AVX512CD(void) { return f_7_ebx[28]; }
|
||||
bool AVX512BW(void) { return f_7_ebx[30]; }
|
||||
bool AVX512VL(void) { return f_7_ebx[31]; }
|
||||
|
||||
bool SHA(void) { return f_7_ebx[29]; }
|
||||
|
||||
bool PREFETCHWT1(void) { return f_7_ecx[0]; }
|
||||
|
||||
bool LAHF(void) { return f_81_ecx[0]; }
|
||||
bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
|
||||
bool ABM(void) { return is_amd && f_81_ecx[5]; }
|
||||
bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
|
||||
bool XOP(void) { return is_amd && f_81_ecx[11]; }
|
||||
bool TBM(void) { return is_amd && f_81_ecx[21]; }
|
||||
|
||||
bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
|
||||
bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
|
||||
bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
|
||||
bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
|
||||
bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
|
||||
|
||||
bool AVX512_VBMI(void) { return f_7_ecx[1]; }
|
||||
bool AVX512_VNNI(void) { return f_7_ecx[11]; }
|
||||
bool AVX512_FP16(void) { return f_7_edx[23]; }
|
||||
bool AVX512_BF16(void) { return f_7_1_eax[5]; }
|
||||
bool AVX_VNNI(void) { return f_7_1_eax[4]; }
|
||||
|
||||
bool AMX_TILE(void) { return f_7_edx[24]; }
|
||||
bool AMX_INT8(void) { return f_7_edx[25]; }
|
||||
bool AMX_FP16(void) { return f_7_1_eax[21]; }
|
||||
bool AMX_BF16(void) { return f_7_edx[22]; }
|
||||
|
||||
#ifdef _MSC_VER
|
||||
static void cpuid(int cpu_info[4], int eax) {
|
||||
__cpuid(cpu_info, eax);
|
||||
}
|
||||
static void cpuidex(int cpu_info[4], int eax, int ecx) {
|
||||
__cpuidex(cpu_info, eax, ecx);
|
||||
}
|
||||
#else
|
||||
static void cpuid(int cpu_info[4], int eax) {
|
||||
__asm__ __volatile__(
|
||||
"cpuid"
|
||||
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
|
||||
: "a"(eax), "c"(0));
|
||||
}
|
||||
static void cpuidex(int cpu_info[4], int eax, int ecx) {
|
||||
__asm__ __volatile__(
|
||||
"cpuid"
|
||||
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
|
||||
: "a"(eax), "c"(ecx));
|
||||
}
|
||||
#endif
|
||||
|
||||
cpuid_x86() {
|
||||
std::array<int, 4> cpui;
|
||||
std::vector<std::array<int, 4>> data;
|
||||
|
||||
// calling __cpuid with 0x0 as the function_id argument
|
||||
// gets the number of the highest valid function ID.
|
||||
cpuid(cpui.data(), 0);
|
||||
int n_ids = cpui[0];
|
||||
|
||||
for (int i = 0; i <= n_ids; ++i) {
|
||||
cpuidex(cpui.data(), i, 0);
|
||||
data.push_back(cpui);
|
||||
}
|
||||
|
||||
// capture vendor string
|
||||
char vendor[0x20] = {};
|
||||
*reinterpret_cast<int *>(vendor) = data[0][1];
|
||||
*reinterpret_cast<int *>(vendor + 4) = data[0][3];
|
||||
*reinterpret_cast<int *>(vendor + 8) = data[0][2];
|
||||
this->vendor = vendor;
|
||||
if (this->vendor == "GenuineIntel") {
|
||||
is_intel = true;
|
||||
} else if (this->vendor == "AuthenticAMD") {
|
||||
is_amd = true;
|
||||
}
|
||||
|
||||
// load bitset with flags for function 0x00000001
|
||||
if (n_ids >= 1) {
|
||||
f_1_ecx = data[1][2];
|
||||
f_1_edx = data[1][3];
|
||||
}
|
||||
|
||||
// load bitset with flags for function 0x00000007
|
||||
if (n_ids >= 7) {
|
||||
f_7_ebx = data[7][1];
|
||||
f_7_ecx = data[7][2];
|
||||
f_7_edx = data[7][3];
|
||||
cpuidex(cpui.data(), 7, 1);
|
||||
f_7_1_eax = cpui[0];
|
||||
}
|
||||
|
||||
// calling __cpuid with 0x80000000 as the function_id argument
|
||||
// gets the number of the highest valid extended ID.
|
||||
cpuid(cpui.data(), 0x80000000);
|
||||
unsigned int n_ex_ids = cpui[0];
|
||||
|
||||
std::vector<std::array<int, 4>> ext_data;
|
||||
for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
|
||||
cpuidex(cpui.data(), i, 0);
|
||||
ext_data.push_back(cpui);
|
||||
}
|
||||
|
||||
// load bitset with flags for function 0x80000001
|
||||
if (n_ex_ids >= 0x80000001) {
|
||||
f_81_ecx = ext_data[1][2];
|
||||
f_81_edx = ext_data[1][3];
|
||||
}
|
||||
|
||||
// interpret CPU brand string if reported
|
||||
char brand[0x40] = {};
|
||||
if (n_ex_ids >= 0x80000004) {
|
||||
std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
|
||||
std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
|
||||
std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
|
||||
this->brand = brand;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_intel = false;
|
||||
bool is_amd = false;
|
||||
std::string vendor;
|
||||
std::string brand;
|
||||
std::bitset<32> f_1_ecx;
|
||||
std::bitset<32> f_1_edx;
|
||||
std::bitset<32> f_7_ebx;
|
||||
std::bitset<32> f_7_ecx;
|
||||
std::bitset<32> f_7_edx;
|
||||
std::bitset<32> f_7_1_eax;
|
||||
std::bitset<32> f_81_ecx;
|
||||
std::bitset<32> f_81_edx;
|
||||
};
|
||||
|
||||
#if 0
|
||||
void test_x86_is() {
|
||||
cpuid_x86 is;
|
||||
printf("CPU Vendor: %s\n", is.vendor.c_str());
|
||||
printf("Brand: %s\n", is.brand.c_str());
|
||||
printf("is_intel: %d\n", is.is_intel);
|
||||
printf("is_amd: %d\n", is.is_amd);
|
||||
printf("sse3: %d\n", is.SSE3());
|
||||
printf("pclmulqdq: %d\n", is.PCLMULQDQ());
|
||||
printf("ssse3: %d\n", is.SSSE3());
|
||||
printf("fma: %d\n", is.FMA());
|
||||
printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
|
||||
printf("sse41: %d\n", is.SSE41());
|
||||
printf("sse42: %d\n", is.SSE42());
|
||||
printf("movbe: %d\n", is.MOVBE());
|
||||
printf("popcnt: %d\n", is.POPCNT());
|
||||
printf("aes: %d\n", is.AES());
|
||||
printf("xsave: %d\n", is.XSAVE());
|
||||
printf("osxsave: %d\n", is.OSXSAVE());
|
||||
printf("avx: %d\n", is.AVX());
|
||||
printf("f16c: %d\n", is.F16C());
|
||||
printf("rdrand: %d\n", is.RDRAND());
|
||||
printf("msr: %d\n", is.MSR());
|
||||
printf("cx8: %d\n", is.CX8());
|
||||
printf("sep: %d\n", is.SEP());
|
||||
printf("cmov: %d\n", is.CMOV());
|
||||
printf("clflush: %d\n", is.CLFSH());
|
||||
printf("mmx: %d\n", is.MMX());
|
||||
printf("fxsr: %d\n", is.FXSR());
|
||||
printf("sse: %d\n", is.SSE());
|
||||
printf("sse2: %d\n", is.SSE2());
|
||||
printf("fsgsbase: %d\n", is.FSGSBASE());
|
||||
printf("bmi1: %d\n", is.BMI1());
|
||||
printf("hle: %d\n", is.HLE());
|
||||
printf("avx2: %d\n", is.AVX2());
|
||||
printf("bmi2: %d\n", is.BMI2());
|
||||
printf("erms: %d\n", is.ERMS());
|
||||
printf("invpcid: %d\n", is.INVPCID());
|
||||
printf("rtm: %d\n", is.RTM());
|
||||
printf("avx512f: %d\n", is.AVX512F());
|
||||
printf("rdseed: %d\n", is.RDSEED());
|
||||
printf("adx: %d\n", is.ADX());
|
||||
printf("avx512pf: %d\n", is.AVX512PF());
|
||||
printf("avx512er: %d\n", is.AVX512ER());
|
||||
printf("avx512cd: %d\n", is.AVX512CD());
|
||||
printf("sha: %d\n", is.SHA());
|
||||
printf("prefetchwt1: %d\n", is.PREFETCHWT1());
|
||||
printf("lahf: %d\n", is.LAHF());
|
||||
printf("lzcnt: %d\n", is.LZCNT());
|
||||
printf("abm: %d\n", is.ABM());
|
||||
printf("sse4a: %d\n", is.SSE4a());
|
||||
printf("xop: %d\n", is.XOP());
|
||||
printf("tbm: %d\n", is.TBM());
|
||||
printf("syscall: %d\n", is.SYSCALL());
|
||||
printf("mmxext: %d\n", is.MMXEXT());
|
||||
printf("rdtscp: %d\n", is.RDTSCP());
|
||||
printf("3dnowext: %d\n", is._3DNOWEXT());
|
||||
printf("3dnow: %d\n", is._3DNOW());
|
||||
printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
|
||||
printf("avx512_vnni: %d\n", is.AVX512_VNNI());
|
||||
printf("avx512_fp16: %d\n", is.AVX512_FP16());
|
||||
printf("avx512_bf16: %d\n", is.AVX512_BF16());
|
||||
printf("amx_tile: %d\n", is.AMX_TILE());
|
||||
printf("amx_int8: %d\n", is.AMX_INT8());
|
||||
printf("amx_fp16: %d\n", is.AMX_FP16());
|
||||
printf("amx_bf16: %d\n", is.AMX_BF16());
|
||||
}
|
||||
#endif
|
||||
|
||||
static int ggml_backend_cpu_x86_score() {
|
||||
// FIXME: this does not check for OS support
|
||||
|
||||
int score = 1;
|
||||
cpuid_x86 is;
|
||||
|
||||
#ifdef GGML_FMA
|
||||
if (!is.FMA()) { return 0; }
|
||||
score += 1;
|
||||
#endif
|
||||
#ifdef GGML_F16C
|
||||
if (!is.F16C()) { return 0; }
|
||||
score += 1<<1;
|
||||
#endif
|
||||
#ifdef GGML_SSE42
|
||||
if (!is.SSE42()) { return 0; }
|
||||
score += 1<<2;
|
||||
#endif
|
||||
#ifdef GGML_BMI2
|
||||
if (!is.BMI2()) { return 0; }
|
||||
score += 1<<3;
|
||||
#endif
|
||||
#ifdef GGML_AVX
|
||||
if (!is.AVX()) { return 0; }
|
||||
score += 1<<4;
|
||||
#endif
|
||||
#ifdef GGML_AVX2
|
||||
if (!is.AVX2()) { return 0; }
|
||||
score += 1<<5;
|
||||
#endif
|
||||
#ifdef GGML_AVX_VNNI
|
||||
if (!is.AVX_VNNI()) { return 0; }
|
||||
score += 1<<6;
|
||||
#endif
|
||||
#ifdef GGML_AVX512
|
||||
if (!is.AVX512F()) { return 0; }
|
||||
if (!is.AVX512CD()) { return 0; }
|
||||
if (!is.AVX512VL()) { return 0; }
|
||||
if (!is.AVX512DQ()) { return 0; }
|
||||
if (!is.AVX512BW()) { return 0; }
|
||||
score += 1<<7;
|
||||
#endif
|
||||
#ifdef GGML_AVX512_VBMI
|
||||
if (!is.AVX512_VBMI()) { return 0; }
|
||||
score += 1<<8;
|
||||
#endif
|
||||
#ifdef GGML_AVX512_BF16
|
||||
if (!is.AVX512_BF16()) { return 0; }
|
||||
score += 1<<9;
|
||||
#endif
|
||||
#ifdef GGML_AVX512_VNNI
|
||||
if (!is.AVX512_VNNI()) { return 0; }
|
||||
score += 1<<10;
|
||||
#endif
|
||||
#ifdef GGML_AMX_INT8
|
||||
if (!is.AMX_INT8()) { return 0; }
|
||||
score += 1<<11;
|
||||
#endif
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
|
||||
|
||||
#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,158 @@
|
||||
#include "binary-ops.h"
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
#include <Accelerate/Accelerate.h>
|
||||
|
||||
using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
|
||||
#endif
|
||||
|
||||
static inline float op_add(float a, float b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
static inline float op_sub(float a, float b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
static inline float op_mul(float a, float b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
static inline float op_div(float a, float b) {
|
||||
return a / b;
|
||||
}
|
||||
|
||||
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
|
||||
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
|
||||
constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
|
||||
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
|
||||
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
|
||||
constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
|
||||
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
int i10 = i % ne10;
|
||||
const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
|
||||
z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT( nb0 == sizeof(dst_t));
|
||||
GGML_ASSERT(nb00 == sizeof(src0_t));
|
||||
|
||||
const auto [ir0, ir1] = get_thread_range(params, src0);
|
||||
const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
|
||||
|
||||
if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src1));
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
vDSP_fn_t vDSP_op = nullptr;
|
||||
// TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
if (op == op_add) {
|
||||
vDSP_op = vDSP_vadd;
|
||||
} else if (op == op_sub) {
|
||||
vDSP_op = vDSP_vsub;
|
||||
} else if (op == op_mul) {
|
||||
vDSP_op = vDSP_vmul;
|
||||
} else if (op == op_div) {
|
||||
vDSP_op = vDSP_vdiv;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
const int64_t i13 = i03 % ne13;
|
||||
const int64_t i12 = i02 % ne12;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
|
||||
dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||
const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
||||
|
||||
if (is_src1_contiguous) {
|
||||
// src1 is broadcastable across src0 and dst in i1, i2, i3
|
||||
const int64_t nr0 = ne00 / ne10;
|
||||
|
||||
for (int64_t r = 0; r < nr0; ++r) {
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
|
||||
if (vDSP_op != nullptr) {
|
||||
vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
|
||||
}
|
||||
} else {
|
||||
vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
|
||||
template <float (*op)(float, float)>
|
||||
static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
/* */ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
|
||||
apply_binary_op<op, float, float, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
|
||||
apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
|
||||
apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) {
|
||||
apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
||||
apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_add>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_sub>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_mul>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_div>(params, dst);
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,100 @@
|
||||
include(CheckCSourceRuns)
|
||||
|
||||
set(AVX_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m256 a;
|
||||
a = _mm256_set1_ps(0);
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
set(AVX512_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0);
|
||||
__m512i b = a;
|
||||
__mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
set(AVX2_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m256i a = {0};
|
||||
a = _mm256_abs_epi16(a);
|
||||
__m256i x;
|
||||
_mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
set(FMA_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
const __m256 d = _mm256_setzero_ps();
|
||||
const __m256 p = _mm256_setzero_ps();
|
||||
acc = _mm256_fmadd_ps( d, p, acc );
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
macro(check_sse type flags)
|
||||
set(__FLAG_I 1)
|
||||
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
||||
foreach (__FLAG ${flags})
|
||||
if (NOT ${type}_FOUND)
|
||||
set(CMAKE_REQUIRED_FLAGS ${__FLAG})
|
||||
check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
|
||||
if (HAS_${type}_${__FLAG_I})
|
||||
set(${type}_FOUND TRUE CACHE BOOL "${type} support")
|
||||
set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
|
||||
endif()
|
||||
math(EXPR __FLAG_I "${__FLAG_I}+1")
|
||||
endif()
|
||||
endforeach()
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
||||
|
||||
if (NOT ${type}_FOUND)
|
||||
set(${type}_FOUND FALSE CACHE BOOL "${type} support")
|
||||
set(${type}_FLAGS "" CACHE STRING "${type} flags")
|
||||
endif()
|
||||
|
||||
mark_as_advanced(${type}_FOUND ${type}_FLAGS)
|
||||
endmacro()
|
||||
|
||||
# flags are for MSVC only!
|
||||
check_sse("AVX" " ;/arch:AVX")
|
||||
if (NOT ${AVX_FOUND})
|
||||
set(GGML_AVX OFF)
|
||||
else()
|
||||
set(GGML_AVX ON)
|
||||
endif()
|
||||
|
||||
check_sse("AVX2" " ;/arch:AVX2")
|
||||
check_sse("FMA" " ;/arch:AVX2")
|
||||
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
|
||||
set(GGML_AVX2 OFF)
|
||||
else()
|
||||
set(GGML_AVX2 ON)
|
||||
endif()
|
||||
|
||||
check_sse("AVX512" " ;/arch:AVX512")
|
||||
if (NOT ${AVX512_FOUND})
|
||||
set(GGML_AVX512 OFF)
|
||||
else()
|
||||
set(GGML_AVX512 ON)
|
||||
endif()
|
||||
@@ -0,0 +1,87 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "traits.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "simd-mappings.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
#include <utility>
|
||||
|
||||
// convenience functions/macros for use in template calls
|
||||
// note: these won't be required after the 'traits' lookup table is used.
|
||||
static inline ggml_fp16_t f32_to_f16(float x) {
|
||||
return GGML_CPU_FP32_TO_FP16(x);
|
||||
}
|
||||
|
||||
static inline float f16_to_f32(ggml_fp16_t x) {
|
||||
return GGML_CPU_FP16_TO_FP32(x);
|
||||
}
|
||||
|
||||
static inline ggml_bf16_t f32_to_bf16(float x) {
|
||||
return GGML_FP32_TO_BF16(x);
|
||||
}
|
||||
|
||||
static inline float bf16_to_f32(ggml_bf16_t x) {
|
||||
return GGML_BF16_TO_FP32(x);
|
||||
}
|
||||
|
||||
static inline float i32_to_f32(int32_t x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
static inline int32_t f32_to_i32(float x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
static inline float f32_to_f32(float x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
// TODO - merge this into the traits table, after using row-based conversions
|
||||
template <class T>
|
||||
struct type_conversion_table;
|
||||
|
||||
template <>
|
||||
struct type_conversion_table<ggml_fp16_t> {
|
||||
static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
|
||||
static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_conversion_table<float> {
|
||||
static constexpr float (*to_f32)(float) = f32_to_f32;
|
||||
static constexpr float (*from_f32)(float) = f32_to_f32;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_conversion_table<ggml_bf16_t> {
|
||||
static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
|
||||
static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_conversion_table<int32_t> {
|
||||
static constexpr float (*to_f32)(int32_t) = i32_to_f32;
|
||||
static constexpr int32_t (*from_f32)(float) = f32_to_i32;
|
||||
};
|
||||
|
||||
static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
|
||||
const int64_t ith = params->ith;
|
||||
const int64_t nth = params->nth;
|
||||
|
||||
const int64_t nr = ggml_nrows(src0);
|
||||
|
||||
// rows per thread
|
||||
const int64_t dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int64_t ir0 = dr*ith;
|
||||
const int64_t ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
return {ir0, ir1};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,526 @@
|
||||
#pragma once
|
||||
|
||||
// GGML CPU internal header
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
//#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h> // memcpy
|
||||
#include <math.h> // fabsf
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct ggml_compute_params {
|
||||
// ith = thread index, nth = number of threads
|
||||
int ith, nth;
|
||||
|
||||
// work buffer for all threads
|
||||
size_t wsize;
|
||||
void * wdata;
|
||||
|
||||
struct ggml_threadpool * threadpool;
|
||||
};
|
||||
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
#define m512bh(p) p
|
||||
#define m512i(p) p
|
||||
|
||||
#else
|
||||
|
||||
#define m512bh(p) (__m512bh)(p)
|
||||
#define m512i(p) (__m512i)(p)
|
||||
|
||||
#endif
|
||||
|
||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __FMA__
|
||||
#define __FMA__
|
||||
#endif
|
||||
#ifndef __F16C__
|
||||
#define __F16C__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __SSE3__
|
||||
#define __SSE3__
|
||||
#endif
|
||||
#ifndef __SSSE3__
|
||||
#define __SSSE3__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__s390x__) && defined(__VEC__)
|
||||
#ifndef __VXE__
|
||||
#define __VXE__
|
||||
#endif // __VXE__
|
||||
#ifndef __VXE2__
|
||||
#define __VXE2__
|
||||
#endif // __VXE2__
|
||||
#endif // __s390x__ && __VEC__
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) && defined(__linux__)
|
||||
#include <sys/prctl.h>
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/5404
|
||||
#ifdef _MSC_VER
|
||||
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
||||
#else
|
||||
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
||||
#endif // _MSC_VER
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
// 32-bit ARM compatibility
|
||||
|
||||
// vaddlvq_s16
|
||||
// vpaddq_s16
|
||||
// vpaddq_s32
|
||||
// vaddvq_s32
|
||||
// vaddvq_f32
|
||||
// vmaxvq_f32
|
||||
// vcvtnq_s32_f32
|
||||
// vzip1_u8
|
||||
// vzip2_u8
|
||||
|
||||
inline static int32_t vaddlvq_s16(int16x8_t v) {
|
||||
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
|
||||
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
|
||||
}
|
||||
|
||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
||||
return vcombine_s32(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
int32x4_t res;
|
||||
|
||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
||||
res[1] = roundf(vgetq_lane_f32(v, 1));
|
||||
res[2] = roundf(vgetq_lane_f32(v, 2));
|
||||
res[3] = roundf(vgetq_lane_f32(v, 3));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[0]; res[1] = b[0];
|
||||
res[2] = a[1]; res[3] = b[1];
|
||||
res[4] = a[2]; res[5] = b[2];
|
||||
res[6] = a[3]; res[7] = b[3];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[4]; res[1] = b[4];
|
||||
res[2] = a[5]; res[3] = b[5];
|
||||
res[4] = a[6]; res[5] = b[6];
|
||||
res[6] = a[7]; res[7] = b[7];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// vld1q_s16_x2
|
||||
// vld1q_u8_x2
|
||||
// vld1q_u8_x4
|
||||
// vld1q_s8_x2
|
||||
// vld1q_s8_x4
|
||||
// TODO: double-check these work correctly
|
||||
|
||||
typedef struct ggml_int16x8x2_t {
|
||||
int16x8_t val[2];
|
||||
} ggml_int16x8x2_t;
|
||||
|
||||
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
||||
ggml_int16x8x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s16(ptr + 0);
|
||||
res.val[1] = vld1q_s16(ptr + 8);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x2_t {
|
||||
uint8x16_t val[2];
|
||||
} ggml_uint8x16x2_t;
|
||||
|
||||
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
||||
ggml_uint8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x4_t {
|
||||
uint8x16_t val[4];
|
||||
} ggml_uint8x16x4_t;
|
||||
|
||||
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
||||
ggml_uint8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
res.val[2] = vld1q_u8(ptr + 32);
|
||||
res.val[3] = vld1q_u8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x2_t {
|
||||
int8x16_t val[2];
|
||||
} ggml_int8x16x2_t;
|
||||
|
||||
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
||||
ggml_int8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x4_t {
|
||||
int8x16_t val[4];
|
||||
} ggml_int8x16x4_t;
|
||||
|
||||
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
||||
ggml_int8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
res.val[2] = vld1q_s8(ptr + 32);
|
||||
res.val[3] = vld1q_s8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
uint8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
#define ggml_uint8x16x2_t uint8x16x2_t
|
||||
#define ggml_uint8x16x4_t uint8x16x4_t
|
||||
#define ggml_int8x16x2_t int8x16x2_t
|
||||
#define ggml_int8x16x4_t int8x16x4_t
|
||||
|
||||
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
||||
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
||||
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif // !defined(__aarch64__)
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
||||
|
||||
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
#endif // defined(__ARM_NEON)
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#endif
|
||||
|
||||
#ifdef __POWER9_VECTOR__
|
||||
#include <altivec.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#elif defined(__SSE__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__AVX__) || defined(__F16C__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX512BF16__)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch64)
|
||||
#if defined(__loongarch_asx)
|
||||
#include <lasxintrin.h>
|
||||
#endif
|
||||
#if defined(__loongarch_sx)
|
||||
#include <lsxintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__VXE__) || defined(__VXE2__)
|
||||
#include <vecintrin.h>
|
||||
|
||||
#define vec_neg(a) (-(a)) // Vector Negate
|
||||
#define vec_add(a, b) ((a) + (b)) // Vector Add
|
||||
#define vec_sub(a, b) ((a) - (b)) // Vector Subtract
|
||||
#define vec_mul(a, b) ((a) * (b)) // Vector Multiply
|
||||
#define vec_div(a, b) ((a) / (b)) // Vector Divide
|
||||
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
|
||||
#define vec_sra(a, b) ((a) >> (b)) // Vector Shift Right
|
||||
#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
|
||||
#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
|
||||
#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
|
||||
|
||||
#ifndef vec_and
|
||||
#define vec_and(a, b) ((a) & (b)) // Vector AND
|
||||
#endif
|
||||
|
||||
#ifndef vec_or
|
||||
#define vec_or(a, b) ((a) | (b)) // Vector OR
|
||||
#endif
|
||||
|
||||
#ifndef vec_xor
|
||||
#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
|
||||
#endif
|
||||
|
||||
typedef signed char char8x16_t __attribute__((vector_size(16)));
|
||||
typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
|
||||
|
||||
typedef int8_t int8x16_t __attribute__((vector_size(16)));
|
||||
typedef int16_t int16x8_t __attribute__((vector_size(16)));
|
||||
typedef int32_t int32x4_t __attribute__((vector_size(16)));
|
||||
|
||||
typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
|
||||
typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
|
||||
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
|
||||
|
||||
typedef float float32x4_t __attribute__((vector_size(16)));
|
||||
typedef double double64x2_t __attribute__((vector_size(16)));
|
||||
|
||||
typedef signed long long long64x2_t __attribute__((vector_size(16)));
|
||||
typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
|
||||
|
||||
typedef struct ggml_uint8x16x2_t {
|
||||
uint8x16_t val[2];
|
||||
} ggml_uint8x16x2_t;
|
||||
|
||||
inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2(const uint8_t * ptr) {
|
||||
ggml_uint8x16x2_t res;
|
||||
|
||||
res.val[0] = vec_xl( 0, ptr);
|
||||
res.val[1] = vec_xl(16, ptr);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x4_t {
|
||||
uint8x16_t val[4];
|
||||
} ggml_uint8x16x4_t;
|
||||
|
||||
inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4(const uint8_t * ptr) {
|
||||
ggml_uint8x16x4_t res;
|
||||
|
||||
res.val[0] = vec_xl( 0, ptr);
|
||||
res.val[1] = vec_xl(16, ptr);
|
||||
res.val[2] = vec_xl(32, ptr);
|
||||
res.val[3] = vec_xl(48, ptr);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x4_t {
|
||||
int8x16_t val[4];
|
||||
} ggml_int8x16x4_t;
|
||||
|
||||
inline static ggml_int8x16x4_t ggml_vec_xl_s8x4(const int8_t * ptr) {
|
||||
ggml_int8x16x4_t res;
|
||||
|
||||
res.val[0] = vec_xl( 0, ptr);
|
||||
res.val[1] = vec_xl(16, ptr);
|
||||
res.val[2] = vec_xl(32, ptr);
|
||||
res.val[3] = vec_xl(48, ptr);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int16x8x2_t {
|
||||
int16x8_t val[2];
|
||||
} ggml_int16x8x2_t;
|
||||
|
||||
inline static ggml_int16x8x2_t ggml_vec_xl_s16x2(const int16_t * ptr) {
|
||||
ggml_int16x8x2_t res;
|
||||
|
||||
res.val[0] = vec_xl( 0, ptr);
|
||||
res.val[1] = vec_xl(16, ptr);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/*
|
||||
! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
|
||||
! or iq4_nl for example implementation.
|
||||
*/
|
||||
inline static int8x16_t ggml_vec_tbl(int8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
|
||||
const uchar8x16_t v_maske = { 0, 1, 4, 5, 8, 9, 12, 13,
|
||||
16, 17, 20, 21, 24, 25, 28, 29 };
|
||||
|
||||
const int16x8_t v_abo = vec_pack((int32x4_t)a, (int32x4_t)b);
|
||||
const int16x8_t v_abe = vec_perm(a, b, v_maske);
|
||||
return v_abo + v_abe;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see https://github.com/ggml-org/llama.cpp/pull/14037
|
||||
*/
|
||||
inline static float vec_hsum_f32x4(float32x4_t v) {
|
||||
float32x4_t v_temp = v + vec_reve(v);
|
||||
return v_temp[0] + v_temp[1];
|
||||
}
|
||||
|
||||
inline static int32_t vec_hsum_i32x4(int32x4_t v) {
|
||||
int32x4_t v_temp = v + vec_reve(v);
|
||||
return v_temp[0] + v_temp[1];
|
||||
}
|
||||
|
||||
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
|
||||
return acc + (vec_unpackh(p) + vec_unpackl(p));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch_sx)
|
||||
/* float type data load instructions */
|
||||
static __m128 __lsx_vreplfr2vr_s(const float val) {
|
||||
v4f32 res = {val, val, val, val};
|
||||
return (__m128)res;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch_asx)
|
||||
static __m256 __lasx_xvreplfr2vr_s(const float val) {
|
||||
v8f32 res = {val, val, val, val, val, val, val, val};
|
||||
return (__m256)res;
|
||||
}
|
||||
#endif
|
||||
|
||||
// TODO: move to ggml-threading
|
||||
void ggml_barrier(struct ggml_threadpool * tp);
|
||||
|
||||
void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
|
||||
int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,686 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "repack.h"
|
||||
#include "traits.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "amx/amx.h"
|
||||
|
||||
#include <cctype>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
# include "hbm.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_KLEIDIAI
|
||||
# include "kleidiai/kleidiai.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
|
||||
# include "spacemit/ime.h"
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#else
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
# include <sys/sysctl.h>
|
||||
# include <sys/types.h>
|
||||
#endif
|
||||
|
||||
// ggml-backend interface
|
||||
|
||||
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
|
||||
static std::vector<ggml_backend_buffer_type_t> bufts = []() {
|
||||
std::vector<ggml_backend_buffer_type_t> bufts;
|
||||
|
||||
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
|
||||
if (ggml_backend_amx_buffer_type()) {
|
||||
bufts.push_back(ggml_backend_amx_buffer_type());
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
|
||||
if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
|
||||
bufts.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type());
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_KLEIDIAI
|
||||
if (ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_REPACK
|
||||
if (ggml_backend_cpu_repack_buffer_type()) {
|
||||
bufts.push_back(ggml_backend_cpu_repack_buffer_type());
|
||||
}
|
||||
#endif
|
||||
|
||||
return bufts;
|
||||
}();
|
||||
|
||||
return bufts;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
|
||||
static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] {
|
||||
std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types();
|
||||
bufts.push_back(nullptr);
|
||||
return bufts;
|
||||
}();
|
||||
|
||||
return extra_bufts.data();
|
||||
|
||||
GGML_UNUSED(device);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
|
||||
for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) {
|
||||
if (extra == buft) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// CPU backend - backend (stream)
|
||||
|
||||
struct ggml_backend_cpu_context {
|
||||
int n_threads;
|
||||
ggml_threadpool_t threadpool;
|
||||
|
||||
uint8_t * work_data;
|
||||
size_t work_size;
|
||||
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
delete[] cpu_ctx->work_data;
|
||||
delete cpu_ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
struct ggml_backend_plan_cpu {
|
||||
struct ggml_cplan cplan;
|
||||
struct ggml_cgraph cgraph;
|
||||
};
|
||||
|
||||
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
|
||||
|
||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
||||
|
||||
if (cpu_plan->cplan.work_size > 0) {
|
||||
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
|
||||
if (cpu_plan->cplan.work_data == NULL) {
|
||||
delete cpu_plan;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
|
||||
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
|
||||
return cpu_plan;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
delete[] cpu_plan->cplan.work_data;
|
||||
delete cpu_plan;
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||
|
||||
if (cpu_ctx->work_size < cplan.work_size) {
|
||||
delete[] cpu_ctx->work_data;
|
||||
cpu_ctx->work_data = new uint8_t[cplan.work_size];
|
||||
if (cpu_ctx->work_data == NULL) {
|
||||
cpu_ctx->work_size = 0;
|
||||
return GGML_STATUS_ALLOC_FAILED;
|
||||
}
|
||||
cpu_ctx->work_size = cplan.work_size;
|
||||
}
|
||||
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
|
||||
|
||||
cplan.abort_callback = cpu_ctx->abort_callback;
|
||||
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_get_name,
|
||||
/* .free = */ ggml_backend_cpu_free,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
/* .synchronize = */ NULL,
|
||||
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
||||
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
||||
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
|
||||
return &guid;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_cpu_init(void) {
|
||||
// initialize CPU backend now to avoid slowing the first graph computation
|
||||
ggml_cpu_init();
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
|
||||
if (ctx == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||||
ctx->threadpool = NULL;
|
||||
ctx->work_data = NULL;
|
||||
ctx->work_size = 0;
|
||||
ctx->abort_callback = NULL;
|
||||
ctx->abort_callback_data = NULL;
|
||||
|
||||
ggml_backend_t cpu_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cpu_guid(),
|
||||
/* .iface = */ ggml_backend_cpu_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
if (cpu_backend == NULL) {
|
||||
delete ctx;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return cpu_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
ctx->n_threads = n_threads;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
|
||||
if (ctx->threadpool && ctx->threadpool != threadpool) {
|
||||
// already had a different threadpool, pause/suspend it before switching
|
||||
ggml_threadpool_pause(ctx->threadpool);
|
||||
}
|
||||
ctx->threadpool = threadpool;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
ctx->abort_callback = abort_callback;
|
||||
ctx->abort_callback_data = abort_callback_data;
|
||||
}
|
||||
|
||||
// CPU backend - device
|
||||
|
||||
struct ggml_backend_cpu_device_context {
|
||||
std::string description = "CPU";
|
||||
|
||||
ggml_backend_cpu_device_context() {
|
||||
#ifdef __APPLE__
|
||||
size_t len = 0;
|
||||
if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
|
||||
description.resize(len);
|
||||
sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
|
||||
}
|
||||
#elif defined(__linux__)
|
||||
FILE * f = fopen("/proc/cpuinfo", "r");
|
||||
if (f) {
|
||||
char buf[1024];
|
||||
while (fgets(buf, sizeof(buf), f)) {
|
||||
if (strncmp(buf, "model name", 10) == 0) {
|
||||
char * p = strchr(buf, ':');
|
||||
if (p) {
|
||||
p++;
|
||||
while (std::isspace(*p)) {
|
||||
p++;
|
||||
}
|
||||
while (std::isspace(p[strlen(p) - 1])) {
|
||||
p[strlen(p) - 1] = '\0';
|
||||
}
|
||||
description = p;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
HKEY hKey;
|
||||
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
|
||||
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
|
||||
0,
|
||||
KEY_READ,
|
||||
&hKey) == ERROR_SUCCESS) {
|
||||
DWORD cpu_brand_size = 0;
|
||||
if (RegQueryValueExA(hKey,
|
||||
"ProcessorNameString",
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
&cpu_brand_size) == ERROR_SUCCESS) {
|
||||
description.resize(cpu_brand_size);
|
||||
if (RegQueryValueExA(hKey,
|
||||
"ProcessorNameString",
|
||||
NULL,
|
||||
NULL,
|
||||
(LPBYTE)&description[0], // NOLINT
|
||||
&cpu_brand_size) == ERROR_SUCCESS) {
|
||||
if (description.find('\0') != std::string::npos) {
|
||||
description.resize(description.find('\0'));
|
||||
}
|
||||
}
|
||||
}
|
||||
RegCloseKey(hKey);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
|
||||
struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
|
||||
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
#ifdef _WIN32
|
||||
MEMORYSTATUSEX status;
|
||||
status.dwLength = sizeof(status);
|
||||
GlobalMemoryStatusEx(&status);
|
||||
*total = status.ullTotalPhys;
|
||||
*free = status.ullAvailPhys;
|
||||
#else
|
||||
long pages = sysconf(_SC_PHYS_PAGES);
|
||||
long page_size = sysconf(_SC_PAGE_SIZE);
|
||||
*total = pages * page_size;
|
||||
|
||||
// "free" system memory is ill-defined, for practical purposes assume that all of it is free:
|
||||
*free = *total;
|
||||
#endif // _WIN32
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
|
||||
return GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_cpu_device_get_name(dev);
|
||||
props->description = ggml_backend_cpu_device_get_description(dev);
|
||||
props->type = ggml_backend_cpu_device_get_type(dev);
|
||||
ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
/* .host_buffer = */ false,
|
||||
/* .buffer_from_host_ptr = */ true,
|
||||
/* .events = */ false,
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
|
||||
return ggml_backend_cpu_init();
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(params);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
return ggml_backend_cpu_buffer_type();
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(max_tensor_size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * src1 = op->src[1];
|
||||
|
||||
if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// check extra buffer types
|
||||
// note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (op->src[i] && op->src[i]->buffer &&
|
||||
ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
|
||||
auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
|
||||
return buf_extra->supports_op(dev, op);
|
||||
}
|
||||
}
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_SET_ROWS:
|
||||
return
|
||||
op->type != GGML_TYPE_IQ3_XXS &&
|
||||
op->type != GGML_TYPE_IQ3_S &&
|
||||
op->type != GGML_TYPE_IQ2_XXS &&
|
||||
op->type != GGML_TYPE_IQ2_XS &&
|
||||
op->type != GGML_TYPE_IQ2_S &&
|
||||
op->type != GGML_TYPE_IQ1_S &&
|
||||
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
||||
case GGML_OP_MUL_MAT:
|
||||
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
|
||||
case GGML_OP_SOFT_MAX_BACK: {
|
||||
if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
|
||||
|
||||
return max_bias == 0.0f;
|
||||
}
|
||||
case GGML_OP_IM2COL_BACK:
|
||||
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
|
||||
case GGML_OP_GET_ROWS_BACK:
|
||||
return src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16;
|
||||
case GGML_OP_OUT_PROD:
|
||||
return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
|
||||
src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft);
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_device_get_name,
|
||||
/* .get_description = */ ggml_backend_cpu_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_cpu_device_get_type,
|
||||
/* .get_props = */ ggml_backend_cpu_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_cpu_device_init_backend,
|
||||
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
|
||||
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
// CPU backend - backend (reg)
|
||||
|
||||
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
return 1;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
||||
GGML_ASSERT(index == 0);
|
||||
|
||||
static ggml_backend_cpu_device_context ctx;
|
||||
static ggml_backend_device ggml_backend_cpu_device = {
|
||||
/* .iface = */ ggml_backend_cpu_device_i,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ &ctx,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_device;
|
||||
}
|
||||
|
||||
// This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
|
||||
// and additionally to allow other backends to expose their own list of features that applications can query using the same API
|
||||
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
|
||||
static std::vector<ggml_backend_feature> features = []() {
|
||||
ggml_cpu_init();
|
||||
|
||||
std::vector<ggml_backend_feature> features;
|
||||
if (ggml_cpu_has_sse3()) {
|
||||
features.push_back({ "SSE3", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_ssse3()) {
|
||||
features.push_back({ "SSSE3", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx()) {
|
||||
features.push_back({ "AVX", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx_vnni()) {
|
||||
features.push_back({ "AVX_VNNI", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx2()) {
|
||||
features.push_back({ "AVX2", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_f16c()) {
|
||||
features.push_back({ "F16C", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_fma()) {
|
||||
features.push_back({ "FMA", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_bmi2()) {
|
||||
features.push_back({ "BMI2", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512()) {
|
||||
features.push_back({ "AVX512", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512_vbmi()) {
|
||||
features.push_back({ "AVX512_VBMI", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512_vnni()) {
|
||||
features.push_back({ "AVX512_VNNI", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512_bf16()) {
|
||||
features.push_back({ "AVX512_BF16", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_amx_int8()) {
|
||||
features.push_back({ "AMX_INT8", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_neon()) {
|
||||
features.push_back({ "NEON", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_arm_fma()) {
|
||||
features.push_back({ "ARM_FMA", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_fp16_va()) {
|
||||
features.push_back({ "FP16_VA", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_matmul_int8()) {
|
||||
features.push_back({ "MATMUL_INT8", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_sve()) {
|
||||
features.push_back({ "SVE", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_dotprod()) {
|
||||
features.push_back({ "DOTPROD", "1" });
|
||||
}
|
||||
if (ggml_cpu_get_sve_cnt() > 0) {
|
||||
static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
|
||||
features.push_back({ "SVE_CNT", sve_cnt.c_str() });
|
||||
}
|
||||
if (ggml_cpu_has_sme()) {
|
||||
features.push_back({ "SME", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_riscv_v()) {
|
||||
features.push_back({ "RISCV_V", "1" });
|
||||
}
|
||||
if (ggml_cpu_get_rvv_vlen() > 0) {
|
||||
static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
|
||||
features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
|
||||
}
|
||||
if (ggml_cpu_has_vsx()) {
|
||||
features.push_back({ "VSX", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_vxe()) {
|
||||
features.push_back({ "VXE", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_wasm_simd()) {
|
||||
features.push_back({ "WASM_SIMD", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_llamafile()) {
|
||||
features.push_back({ "LLAMAFILE", "1" });
|
||||
}
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
features.push_back({ "ACCELERATE", "1" });
|
||||
#endif
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
features.push_back({ "CPU_HBM", "1" });
|
||||
#endif
|
||||
#ifdef GGML_USE_OPENMP
|
||||
features.push_back({ "OPENMP", "1" });
|
||||
#endif
|
||||
#ifdef GGML_USE_CPU_KLEIDIAI
|
||||
features.push_back({ "KLEIDIAI", "1" });
|
||||
#endif
|
||||
#ifdef GGML_USE_CPU_REPACK
|
||||
features.push_back({ "REPACK", "1" });
|
||||
#endif
|
||||
|
||||
features.push_back({ nullptr, nullptr });
|
||||
|
||||
return features;
|
||||
}();
|
||||
|
||||
return features.data();
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
|
||||
ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
|
||||
return (void *)fct;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
|
||||
ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
|
||||
return (void *)fct;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_get_features") == 0) {
|
||||
return (void *)ggml_backend_cpu_get_features;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
|
||||
return (void *)ggml_backend_cpu_set_abort_callback;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
|
||||
return (void *)ggml_numa_init;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
|
||||
return (void *)ggml_is_numa;
|
||||
}
|
||||
|
||||
// threadpool - TODO: move to ggml-base
|
||||
if (strcmp(name, "ggml_threadpool_new") == 0) {
|
||||
return (void *)ggml_threadpool_new;
|
||||
}
|
||||
if (strcmp(name, "ggml_threadpool_free") == 0) {
|
||||
return (void *)ggml_threadpool_free;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
|
||||
return (void *)ggml_backend_cpu_set_threadpool;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
|
||||
/* .get_device = */ ggml_backend_cpu_reg_get_device,
|
||||
/* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
|
||||
// init CPU feature detection
|
||||
ggml_cpu_init();
|
||||
|
||||
static struct ggml_backend_reg ggml_backend_cpu_reg = {
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_cpu_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_reg;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
|
||||
@@ -0,0 +1,55 @@
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include "hbm.h"
|
||||
|
||||
// buffer type HBM
|
||||
|
||||
#include <hbwmalloc.h>
|
||||
|
||||
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU_HBM";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
hbw_free(buffer->context);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
||||
size_t size) {
|
||||
void * ptr;
|
||||
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
||||
if (result != 0) {
|
||||
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||
},
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_buffer_type_hbm;
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML CPU internal header
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
@@ -0,0 +1,938 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
// KleidiAI micro-kernels
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
|
||||
#include "kai_matmul_clamp_f32_qai8dxp_qsi8cxp_interface.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
|
||||
#include "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
|
||||
#include "kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
|
||||
#include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
|
||||
#include "kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.h"
|
||||
|
||||
#include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
|
||||
#include "kai_lhs_quant_pack_qai8dxp_f32.h"
|
||||
|
||||
#include "kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.h"
|
||||
#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
|
||||
#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
|
||||
#include "kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h"
|
||||
|
||||
#include "kai_common.h"
|
||||
|
||||
#include "simd-mappings.h"
|
||||
|
||||
#define GGML_COMMON_DECL_CPP
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "kernels.h"
|
||||
|
||||
#define NELEMS(x) (sizeof(x) / sizeof(*x))
|
||||
|
||||
template<size_t(*Fn)(size_t,size_t,size_t)>
|
||||
static inline size_t kernel_offs_fn3(size_t a, size_t b, size_t c) {
|
||||
return Fn(a, b, c);
|
||||
}
|
||||
|
||||
template<size_t(*Fn)(size_t,size_t)>
|
||||
static inline size_t kernel_offs_fn2(size_t a, size_t b, size_t) {
|
||||
return Fn(a, b);
|
||||
}
|
||||
|
||||
template<void(*Fn)(size_t,size_t,size_t,size_t,const void*,const void*,float*,size_t,size_t,float,float)>
|
||||
static inline void kernel_run_fn11(size_t m, size_t n, size_t k, size_t bl,
|
||||
const void* lhs, const void* rhs, void* dst,
|
||||
size_t dst_stride_row, size_t dst_stride_col,
|
||||
float clamp_min, float clamp_max) {
|
||||
Fn(m, n, k, bl, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max);
|
||||
}
|
||||
|
||||
template<void(*Fn)(size_t,size_t,size_t,const void*,const void*,void*,size_t,size_t,float,float)>
|
||||
static inline void kernel_run_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
|
||||
const void* lhs, const void* rhs, void* dst,
|
||||
size_t dst_stride_row, size_t dst_stride_col,
|
||||
float clamp_min, float clamp_max) {
|
||||
Fn(m, n, k, lhs, rhs, dst, dst_stride_row, dst_stride_col, clamp_min, clamp_max);
|
||||
}
|
||||
|
||||
template<void(*Fn)(size_t,size_t,size_t,const void*,const void*,float*,size_t,size_t,float,float)>
|
||||
static inline void kernel_run_float_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
|
||||
const void* lhs, const void* rhs, void* dst,
|
||||
size_t dst_stride_row, size_t dst_stride_col,
|
||||
float clamp_min, float clamp_max) {
|
||||
Fn(m, n, k, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max);
|
||||
}
|
||||
|
||||
template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t)>
|
||||
static inline size_t lhs_ps_fn6(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) {
|
||||
return Fn(m, k, bl, mr, kr, sr);
|
||||
}
|
||||
|
||||
template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t)>
|
||||
static inline size_t lhs_ps_fn5(size_t m, size_t k, size_t /*bl*/, size_t mr, size_t kr, size_t sr) {
|
||||
return Fn(m, k, mr, kr, sr);
|
||||
}
|
||||
|
||||
template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t)>
|
||||
static inline size_t lhs_offs_fn6(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) {
|
||||
return Fn(m_idx, k, bl, mr, kr, sr);
|
||||
}
|
||||
|
||||
template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t)>
|
||||
static inline size_t lhs_offs_fn5(size_t m_idx, size_t k, size_t /*bl*/, size_t mr, size_t kr, size_t sr) {
|
||||
return Fn(m_idx, k, mr, kr, sr);
|
||||
}
|
||||
|
||||
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,size_t,const float*,size_t,void*)>
|
||||
static inline void lhs_pack_float_fn10(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr,
|
||||
size_t m_idx_start, const void* lhs, size_t lhs_stride, void* lhs_packed) {
|
||||
Fn(m, k, bl, mr, kr, sr, m_idx_start, static_cast<const float*>(lhs), lhs_stride, lhs_packed);
|
||||
}
|
||||
|
||||
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,size_t,const void*,size_t,void*)>
|
||||
static inline void lhs_pack_void_fn10(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr,
|
||||
size_t m_idx_start, const void* lhs, size_t lhs_stride, void* lhs_packed) {
|
||||
Fn(m, k, bl, mr, kr, sr, m_idx_start, lhs, lhs_stride, lhs_packed);
|
||||
}
|
||||
|
||||
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const void*,size_t,void*)>
|
||||
static inline void lhs_pack_void_fn9(size_t m, size_t k, size_t /*bl*/, size_t mr, size_t kr, size_t sr,
|
||||
size_t m_idx_start, const void* lhs, size_t lhs_stride, void* lhs_packed) {
|
||||
Fn(m, k, mr, kr, sr, m_idx_start, lhs, lhs_stride, lhs_packed);
|
||||
}
|
||||
|
||||
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const float*,size_t,void*)>
|
||||
static inline void lhs_pack_float_fn9_no_bl(size_t m, size_t k, size_t /*bl*/, size_t mr, size_t kr, size_t sr,
|
||||
size_t m_idx_start, const void * lhs, size_t lhs_stride, void * lhs_packed) {
|
||||
Fn(m, k, mr, kr, sr, m_idx_start, static_cast<const float*>(lhs), lhs_stride, lhs_packed);
|
||||
}
|
||||
|
||||
template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t)>
|
||||
static inline size_t rhs_ps_fn5(size_t n, size_t k, size_t nr, size_t kr, size_t bl) {
|
||||
return Fn(n, k, nr, kr, bl);
|
||||
}
|
||||
|
||||
template<size_t(*Fn)(size_t,size_t)>
|
||||
static inline size_t rhs_ps_fn2(size_t n, size_t k, size_t /*nr*/, size_t /*kr*/, size_t /*bl*/) {
|
||||
return Fn(n, k);
|
||||
}
|
||||
|
||||
template<size_t(*Fn)(size_t,size_t,size_t,size_t)>
|
||||
static inline size_t rhs_stride_fn4(size_t k, size_t nr, size_t kr, size_t bl) {
|
||||
return Fn(k, nr, kr, bl);
|
||||
}
|
||||
|
||||
template<size_t(*Fn)(size_t)>
|
||||
static inline size_t rhs_stride_fn1(size_t k, size_t /*nr*/, size_t /*kr*/, size_t /*bl*/) {
|
||||
return Fn(k);
|
||||
}
|
||||
|
||||
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,size_t,const uint8_t*,const float*,void*,size_t,const struct kai_rhs_pack_qs4cxs1s0_param*)>
|
||||
static inline void rhs_pack_fn12(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl,
|
||||
size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* /*scale*/,
|
||||
void* rhs_packed, size_t extra_bytes, const void* params) {
|
||||
Fn(num_groups, n, k, nr, kr, sr, bl,
|
||||
static_cast<const uint8_t*>(rhs),
|
||||
static_cast<const float*>(bias),
|
||||
rhs_packed, extra_bytes,
|
||||
static_cast<const kai_rhs_pack_qs4cxs1s0_param*>(params));
|
||||
}
|
||||
|
||||
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const int8_t*,const float*,const float*,void*,size_t,const struct kai_rhs_pack_qsi8cx_params*)>
|
||||
static inline void rhs_pack_scale_fn12(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
|
||||
size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale,
|
||||
void* rhs_packed, size_t extra_bytes, const void* params) {
|
||||
Fn(num_groups, n, k, nr, kr, sr,
|
||||
static_cast<const int8_t*>(rhs),
|
||||
static_cast<const float*>(bias),
|
||||
static_cast<const float*>(scale),
|
||||
rhs_packed, extra_bytes,
|
||||
static_cast<const kai_rhs_pack_qsi8cx_params*>(params));
|
||||
}
|
||||
|
||||
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,size_t,const void*,const void*,const void*,void*,size_t,const void*)>
|
||||
static inline void rhs_pack_fn13(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
|
||||
size_t rhs_stride, const void* rhs, const void* bias, const void* scale,
|
||||
void* rhs_packed, size_t extra_bytes, const void* params) {
|
||||
Fn(num_groups, n, k, nr, kr, sr, rhs_stride, rhs, bias, scale, rhs_packed, extra_bytes, params);
|
||||
}
|
||||
|
||||
static const size_t INT4_PER_BYTE = 2;
|
||||
static const size_t INT4_BITS = 4;
|
||||
static const int Q4_0_ZERO_POINT = 8;
|
||||
const size_t INT4_PER_UINT16 = 4;
|
||||
|
||||
static void dequantize_row_qsi4c32pscalef16(
|
||||
const void *packed_data,
|
||||
int32_t row_idx,
|
||||
int64_t nc,
|
||||
float *out,
|
||||
size_t nr_pack,
|
||||
size_t packed_row_stride,
|
||||
size_t kr,
|
||||
size_t bl,
|
||||
size_t num_bytes_multiplier
|
||||
) {
|
||||
size_t group_idx = row_idx / nr_pack;
|
||||
size_t row_in_group = row_idx % nr_pack;
|
||||
const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride;
|
||||
size_t num_blocks = nc / bl;
|
||||
const uint8_t *block_ptr = packed_group;
|
||||
|
||||
for (size_t b = 0; b < num_blocks; ++b) {
|
||||
uint16_t scale_f16 = *((const uint16_t *)(block_ptr + row_in_group * num_bytes_multiplier));
|
||||
float scale = GGML_CPU_FP16_TO_FP32(scale_f16);
|
||||
|
||||
const uint8_t *segment_ptr = block_ptr + nr_pack * num_bytes_multiplier;
|
||||
size_t num_segments = bl / kr;
|
||||
size_t num_bytes_per_segment = kr / INT4_PER_BYTE;
|
||||
|
||||
for (size_t s = 0; s < num_segments; ++s) {
|
||||
const uint8_t *seg_base = segment_ptr + s * nr_pack * num_bytes_per_segment;
|
||||
const uint8_t *qbytes = seg_base + row_in_group * num_bytes_per_segment;
|
||||
for (size_t k = 0; k < num_bytes_per_segment; ++k) {
|
||||
uint8_t byte = qbytes[k] ^ 0x88;
|
||||
int x0 = (byte & 0x0F) - Q4_0_ZERO_POINT;
|
||||
int x1 = (byte >> INT4_BITS) - Q4_0_ZERO_POINT;
|
||||
out[b * bl + s * num_bytes_per_segment + k] = x0 * scale;
|
||||
out[b * bl + s * num_bytes_per_segment + k + bl/2] = x1 * scale;
|
||||
}
|
||||
}
|
||||
block_ptr += nr_pack * num_bytes_multiplier + num_segments * nr_pack * num_bytes_per_segment;
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_row_qsi4c32ps1s0scalef16(
|
||||
const void *packed_data,
|
||||
int32_t row_idx,
|
||||
int64_t k,
|
||||
float *out,
|
||||
size_t nr,
|
||||
size_t packed_row_stride,
|
||||
size_t kr,
|
||||
size_t bl,
|
||||
size_t num_bytes_multiplier
|
||||
) {
|
||||
const size_t num_blocks = k / bl;
|
||||
const size_t bl4 = bl / INT4_PER_UINT16;
|
||||
|
||||
size_t group_idx = row_idx / nr;
|
||||
size_t row_in_group = row_idx % nr;
|
||||
|
||||
const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride;
|
||||
const uint16_t *qdata = (const uint16_t *)packed_group;
|
||||
const uint16_t *scales = (const uint16_t *)(packed_group + packed_row_stride - (nr * num_blocks * num_bytes_multiplier));
|
||||
|
||||
for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) {
|
||||
uint16_t scale_f16 = scales[row_in_group + block_idx * nr];
|
||||
float scale = GGML_CPU_FP16_TO_FP32(scale_f16);
|
||||
|
||||
for (size_t bl4_idx = 0; bl4_idx < bl4; ++bl4_idx) {
|
||||
uint16_t q = qdata[(block_idx * bl4 + bl4_idx) * nr + row_in_group];
|
||||
|
||||
for (size_t qidx = 0; qidx < INT4_PER_UINT16; ++qidx) {
|
||||
int v = ((q >> (qidx * 4)) & 0xF) - Q4_0_ZERO_POINT;
|
||||
out[block_idx * bl + bl4_idx * INT4_BITS + qidx] = v * scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_UNUSED(kr);
|
||||
}
|
||||
|
||||
static void dequantize_row_qsi8cxp(
|
||||
const void *packed_data,
|
||||
int32_t row_idx,
|
||||
int64_t k,
|
||||
float *out,
|
||||
size_t nr,
|
||||
size_t packed_row_stride,
|
||||
size_t kr,
|
||||
size_t bl,
|
||||
size_t num_bytes_multiplier
|
||||
) {
|
||||
GGML_UNUSED(bl);
|
||||
GGML_UNUSED(num_bytes_multiplier);
|
||||
|
||||
const size_t k_internal = ((size_t) k + QK8_0 - 1) / QK8_0 * QK8_0;
|
||||
const size_t group_idx = row_idx / nr;
|
||||
const size_t row_in_group = row_idx % nr;
|
||||
|
||||
const uint8_t * group_ptr = static_cast<const uint8_t *>(packed_data) + group_idx * packed_row_stride;
|
||||
const int8_t * data_base = reinterpret_cast<const int8_t *>(group_ptr);
|
||||
|
||||
const size_t num_blocks = k_internal / kr;
|
||||
|
||||
for (size_t block = 0; block < num_blocks; ++block) {
|
||||
const int8_t * block_ptr = data_base + (block * nr + row_in_group) * kr;
|
||||
for (size_t i = 0; i < kr; ++i) {
|
||||
const size_t k_idx = block * kr + i;
|
||||
if (k_idx < (size_t) k) {
|
||||
out[k_idx] = static_cast<float>(block_ptr[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const uint8_t * sums_ptr = group_ptr + nr * k_internal;
|
||||
GGML_UNUSED(sums_ptr);
|
||||
|
||||
const float * scale_ptr = reinterpret_cast<const float *>(sums_ptr + nr * sizeof(int32_t));
|
||||
const float scale = scale_ptr[row_in_group];
|
||||
|
||||
if (scale == 0.0f) {
|
||||
for (size_t i = 0; i < (size_t) k; ++i) {
|
||||
out[i] = 0.0f;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < (size_t) k; ++i) {
|
||||
out[i] *= scale;
|
||||
}
|
||||
}
|
||||
|
||||
static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
#if defined(__ARM_FEATURE_SME)
|
||||
{
|
||||
/* SME GEMM */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
|
||||
},
|
||||
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32_neon>,
|
||||
},
|
||||
/* SME GEMV */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot>,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32_neon>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
|
||||
/* .to_float = */ dequantize_row_qsi4c32ps1s0scalef16,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon>,
|
||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
{
|
||||
/* SME GEMM */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn10<kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa>,
|
||||
},
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_pack_bf16p2vlx2_f32_sme>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_pack_bf16p2vlx2_f32_sme>,
|
||||
/* .pack_func_ex = */ &lhs_pack_void_fn9<kai_run_lhs_pack_bf16p2vlx2_f32_sme>,
|
||||
},
|
||||
/* SME GEMV */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_lhs_offset_ex = */ nullptr,
|
||||
/* .get_rhs_packed_offset_ex = */ nullptr,
|
||||
/* .run_kernel_ex = */ nullptr,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_pack_bf16p2vlx2_f32_sme>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_pack_bf16p2vlx2_f32_sme>,
|
||||
/* .pack_func_ex = */ &lhs_pack_void_fn9<kai_run_lhs_pack_bf16p2vlx2_f32_sme>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ nullptr,
|
||||
/* .to_float = */ nullptr,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn2<kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn1<kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme>,
|
||||
/* .pack_func_ex = */ &rhs_pack_fn13<kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_F16,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__APPLE__)
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
{
|
||||
/* DOTPROD GEMM */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||
},
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
|
||||
},
|
||||
/* DOTPROD GEMV */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .to_float = */ dequantize_row_qsi4c32pscalef16,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
{
|
||||
/* i8mm GEMM */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||
},
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
},
|
||||
/* i8mm GEMV */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .to_float = */ dequantize_row_qsi4c32pscalef16,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#else
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
{
|
||||
/* SVE i8mm GEMM */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
||||
},
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
},
|
||||
/* SVE dotprod GEMV */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .to_float = */ dequantize_row_qsi4c32pscalef16,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SVE | CPU_FEATURE_I8MM | CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
{
|
||||
/* i8mm GEMM */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||
},
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
},
|
||||
/* i8mm GEMV */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .to_float = */ dequantize_row_qsi4c32pscalef16,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif // __ARM_FEATURE_MATMUL_INT8
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
{
|
||||
/* DOTPROD GEMM */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||
},
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
|
||||
},
|
||||
/* DOTPROD GEMV */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .to_float = */ dequantize_row_qsi4c32pscalef16,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#endif
|
||||
{ /* Sentinel */ }
|
||||
};
|
||||
|
||||
static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||
#if defined(__ARM_FEATURE_SME)
|
||||
{
|
||||
/* SME GEMM */
|
||||
{
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
|
||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
|
||||
},
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
|
||||
},
|
||||
/* SME GEMV */
|
||||
{
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
|
||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon,
|
||||
/* .to_float = */ dequantize_row_qsi8cxp,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q8_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
{
|
||||
/* I8MM GEMM */
|
||||
{
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
|
||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
|
||||
},
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
|
||||
},
|
||||
/* I8MM GEMV (dotprod fallback) */
|
||||
{
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon,
|
||||
/* .to_float = */ dequantize_row_qsi8cxp,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q8_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
{
|
||||
/* DOTPROD GEMM */
|
||||
{
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
|
||||
},
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
|
||||
},
|
||||
/* DOTPROD GEMV */
|
||||
{
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon,
|
||||
/* .to_float = */ dequantize_row_qsi8cxp,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q8_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
{ /* Sentinel */ }
|
||||
};
|
||||
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
|
||||
ggml_kleidiai_kernels * kernel = nullptr;
|
||||
|
||||
if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
|
||||
#if defined(__ARM_FEATURE_SME) || \
|
||||
defined(__ARM_FEATURE_DOTPROD) || \
|
||||
defined(__ARM_FEATURE_MATMUL_INT8) || \
|
||||
defined(__ARM_FEATURE_SVE)
|
||||
auto try_table = [&](auto & table) {
|
||||
for (size_t i = 0; i < NELEMS(table) - 1; ++i) {
|
||||
if ((cpu_features & table[i].required_cpu) == table[i].required_cpu &&
|
||||
table[i].lhs_type == tensor->src[1]->type &&
|
||||
table[i].rhs_type == tensor->src[0]->type &&
|
||||
table[i].op_type == tensor->type) {
|
||||
kernel = &table[i];
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
if (tensor->src[0]->type == GGML_TYPE_Q8_0) {
|
||||
try_table(gemm_gemv_kernels_q8);
|
||||
} else {
|
||||
try_table(gemm_gemv_kernels);
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(gemm_gemv_kernels);
|
||||
GGML_UNUSED(gemm_gemv_kernels_q8);
|
||||
GGML_UNUSED(cpu_features);
|
||||
#endif
|
||||
}
|
||||
|
||||
return kernel;
|
||||
}
|
||||
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) {
|
||||
ggml_kleidiai_kernels * kernels = nullptr;
|
||||
|
||||
#if defined(__ARM_FEATURE_SME) || \
|
||||
defined(__ARM_FEATURE_DOTPROD) || \
|
||||
defined(__ARM_FEATURE_MATMUL_INT8) || \
|
||||
defined(__ARM_FEATURE_SVE)
|
||||
for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
|
||||
if ((features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu) {
|
||||
kernels = &gemm_gemv_kernels[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(features);
|
||||
#endif
|
||||
|
||||
return kernels;
|
||||
}
|
||||
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features) {
|
||||
ggml_kleidiai_kernels * kernels = nullptr;
|
||||
|
||||
#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) {
|
||||
if ((features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu) {
|
||||
kernels = &gemm_gemv_kernels_q8[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(features);
|
||||
#endif
|
||||
|
||||
return kernels;
|
||||
}
|
||||
@@ -0,0 +1,90 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
enum cpu_feature {
|
||||
CPU_FEATURE_NONE = 0,
|
||||
CPU_FEATURE_DOTPROD = 1,
|
||||
CPU_FEATURE_I8MM = 2,
|
||||
CPU_FEATURE_SVE = 4,
|
||||
CPU_FEATURE_SME = 8
|
||||
};
|
||||
|
||||
inline cpu_feature& operator|=(cpu_feature& lhs, cpu_feature rhs) {
|
||||
lhs = static_cast<cpu_feature>(lhs | rhs);
|
||||
return lhs;
|
||||
}
|
||||
inline cpu_feature operator|(cpu_feature lhs, cpu_feature rhs) {
|
||||
return static_cast<cpu_feature>(static_cast<int>(lhs) | static_cast<int>(rhs));
|
||||
}
|
||||
|
||||
struct kernel_info {
|
||||
size_t (*get_m_step)(void);
|
||||
size_t (*get_n_step)(void);
|
||||
size_t (*get_mr)(void);
|
||||
size_t (*get_nr)(void);
|
||||
size_t (*get_kr)(void);
|
||||
size_t (*get_sr)(void);
|
||||
|
||||
size_t (*get_dst_offset)(size_t m_idx, size_t n_idx, size_t stride);
|
||||
size_t (*get_dst_size)(size_t m, size_t n);
|
||||
|
||||
size_t (*get_lhs_offset_ex)(size_t m_idx, size_t k, size_t bl);
|
||||
|
||||
size_t (*get_rhs_packed_offset_ex)(size_t n_idx, size_t k, size_t bl);
|
||||
|
||||
void (*run_kernel_ex)(
|
||||
size_t m, size_t n, size_t k, size_t bl,
|
||||
const void* lhs_packed, const void* rhs_packed,
|
||||
void* dst, size_t dst_stride_row, size_t dst_stride_col,
|
||||
float clamp_min, float clamp_max);
|
||||
};
|
||||
|
||||
struct lhs_packing_info {
|
||||
size_t (*get_offset)(size_t m_idx, size_t lhs_stride);
|
||||
|
||||
size_t (*get_packed_offset_ex)(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
|
||||
|
||||
size_t (*packed_size_ex)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
|
||||
|
||||
void (*pack_func_ex)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr,
|
||||
size_t m_idx_start, const void * lhs, size_t lhs_stride, void * lhs_packed);
|
||||
};
|
||||
|
||||
struct rhs_packing_info {
|
||||
size_t (*packed_stride)(size_t k, size_t nr, size_t kr, size_t bl);
|
||||
|
||||
void (*to_float)(const void *packed_data, int32_t row_idx, int64_t nc, float *out,
|
||||
size_t nr_pack, size_t packed_row_stride, size_t kr, size_t bl,
|
||||
size_t num_bytes_multiplier);
|
||||
|
||||
size_t (*packed_size_ex)(size_t n, size_t k, size_t nr, size_t kr, size_t bl);
|
||||
|
||||
size_t (*packed_stride_ex)(size_t k, size_t nr, size_t kr, size_t bl);
|
||||
|
||||
void (*pack_func_ex)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl,
|
||||
size_t rhs_stride, const void * rhs, const void * bias, const void * scale, void * rhs_packed, size_t extra_bytes, const void * params);
|
||||
};
|
||||
|
||||
struct ggml_kleidiai_kernels {
|
||||
kernel_info gemm;
|
||||
lhs_packing_info gemm_lhs_info;
|
||||
|
||||
kernel_info gemv;
|
||||
lhs_packing_info gemv_lhs_info;
|
||||
|
||||
rhs_packing_info rhs_info;
|
||||
|
||||
cpu_feature required_cpu;
|
||||
ggml_type lhs_type;
|
||||
ggml_type rhs_type;
|
||||
ggml_type op_type;
|
||||
};
|
||||
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor);
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features);
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features);
|
||||
@@ -0,0 +1,798 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
#include <atomic>
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#if defined(__linux__)
|
||||
#include <asm/hwcap.h>
|
||||
#include <sys/auxv.h>
|
||||
#elif defined(__APPLE__)
|
||||
#include <string_view>
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/types.h>
|
||||
#elif defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#include <excpt.h>
|
||||
#endif
|
||||
|
||||
#include "kleidiai.h"
|
||||
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-threading.h"
|
||||
#include "traits.h"
|
||||
|
||||
#include "kernels.h"
|
||||
|
||||
#include "kai_common.h"
|
||||
|
||||
#define GGML_COMMON_DECL_CPP
|
||||
#include "ggml-common.h"
|
||||
|
||||
struct ggml_kleidiai_context {
|
||||
cpu_feature features;
|
||||
ggml_kleidiai_kernels * kernels_q4;
|
||||
ggml_kleidiai_kernels * kernels_q8;
|
||||
} static ctx = { CPU_FEATURE_NONE, NULL, NULL };
|
||||
|
||||
static const char* cpu_feature_to_string(cpu_feature f) {
|
||||
if (f == CPU_FEATURE_NONE) {
|
||||
return "NONE";
|
||||
} else if ((f & CPU_FEATURE_SME) == CPU_FEATURE_SME) {
|
||||
return "SME";
|
||||
} else if ((f & CPU_FEATURE_SVE) == CPU_FEATURE_SVE) {
|
||||
return "SVE";
|
||||
}
|
||||
else if ((f & CPU_FEATURE_I8MM) == CPU_FEATURE_I8MM) {
|
||||
return "I8MM";
|
||||
} else if ((f & CPU_FEATURE_DOTPROD) == CPU_FEATURE_DOTPROD) {
|
||||
return "DOTPROD";
|
||||
}
|
||||
else {
|
||||
return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
static void init_kleidiai_context(void) {
|
||||
|
||||
ggml_critical_section_start();
|
||||
static bool initialized = false;
|
||||
|
||||
if (!initialized) {
|
||||
initialized = true;
|
||||
const char *env_var = getenv("GGML_KLEIDIAI_SME");
|
||||
int sme_enabled = 0;
|
||||
|
||||
ctx.features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
|
||||
((ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
|
||||
|
||||
if (env_var) {
|
||||
sme_enabled = atoi(env_var);
|
||||
}
|
||||
|
||||
if (sme_enabled != 0) {
|
||||
ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
|
||||
}
|
||||
ctx.kernels_q4 = ggml_kleidiai_select_kernels_q4_0(ctx.features);
|
||||
ctx.kernels_q8 = ggml_kleidiai_select_kernels_q8_0(ctx.features);
|
||||
#ifndef NDEBUG
|
||||
if (ctx.kernels_q4) {
|
||||
GGML_LOG_DEBUG("kleidiai: using q4 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q4->required_cpu));
|
||||
}
|
||||
if (ctx.kernels_q8) {
|
||||
GGML_LOG_DEBUG("kleidiai: using q8 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q8->required_cpu));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
ggml_critical_section_end();
|
||||
}
|
||||
|
||||
static inline int64_t ggml_ne(const ggml_tensor * tensor, int dim) {
|
||||
GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
|
||||
return tensor->ne[dim];
|
||||
}
|
||||
|
||||
namespace ggml::cpu::kleidiai {
|
||||
|
||||
static size_t round_down(size_t x, size_t y) {
|
||||
return y == 0 ? x : x - (x % y);
|
||||
}
|
||||
|
||||
static void transpose_f32kxn_f16nxk(size_t n, size_t k, float * dst, const uint16_t * src, size_t rhs_stride) {
|
||||
size_t src_stride = rhs_stride / sizeof(uint16_t);
|
||||
size_t dst_stride = n;
|
||||
|
||||
for (size_t k_idx = 0; k_idx < k; ++k_idx) {
|
||||
for (size_t n_idx = 0; n_idx < n; ++n_idx) {
|
||||
uint16_t v = *(src + k_idx + n_idx * src_stride);
|
||||
*(dst + n_idx + k_idx * dst_stride) = kai_cast_f32_f16(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
|
||||
if (op->op != GGML_OP_MUL_MAT) {
|
||||
return false;
|
||||
}
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op);
|
||||
if (!kernels) {
|
||||
return false;
|
||||
}
|
||||
bool is_gemv = op->src[1]->ne[1] == 1;
|
||||
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
|
||||
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
|
||||
|
||||
size_t k = op->src[0]->ne[0];
|
||||
size_t n = op->src[0]->ne[1];
|
||||
size_t m = op->src[1]->ne[1];
|
||||
|
||||
size_t mr = kernel->get_mr();
|
||||
size_t kr = kernel->get_kr();
|
||||
size_t sr = kernel->get_sr();
|
||||
|
||||
if (kernels->rhs_type == GGML_TYPE_Q4_0) {
|
||||
if (!lhs_info->packed_size_ex) return false;
|
||||
size = lhs_info->packed_size_ex(m, k, QK4_0, mr, kr, sr);
|
||||
} else if (kernels->rhs_type == GGML_TYPE_Q8_0) {
|
||||
if (!lhs_info->packed_size_ex) return false;
|
||||
size = lhs_info->packed_size_ex(m, k, QK8_0, mr, kr, sr);
|
||||
} else if (kernels->rhs_type == GGML_TYPE_F16) {
|
||||
if (!lhs_info->packed_size_ex || !kernels->rhs_info.packed_size_ex) return false;
|
||||
const int64_t lhs_batch_size0 = op->src[1]->ne[2];
|
||||
const int64_t rhs_batch_size0 = op->src[0]->ne[2];
|
||||
const int64_t r = lhs_batch_size0 / rhs_batch_size0;
|
||||
size = lhs_info->packed_size_ex(m * r, k, 0, mr, kr, sr) +
|
||||
kernels->rhs_info.packed_size_ex(n, k, kernel->get_nr(), kernel->get_kr(), 0) +
|
||||
k * n * sizeof(float) + n * sizeof(float);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * dst) override {
|
||||
if (dst->op == GGML_OP_MUL_MAT) {
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
return compute_forward_q4_0(params, dst);
|
||||
} else if (dst->src[0]->type == GGML_TYPE_Q8_0) {
|
||||
return compute_forward_q8_0(params, dst);
|
||||
} else if (dst->src[0]->type == GGML_TYPE_F16) {
|
||||
return compute_forward_fp16(params, dst);
|
||||
}
|
||||
} else if (dst->op == GGML_OP_GET_ROWS) {
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0 || dst->src[0]->type == GGML_TYPE_Q8_0) {
|
||||
return compute_forward_get_rows(params, dst);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool compute_forward_fp16(ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
|
||||
if (!kernels) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const bool is_gemv = src1->ne[1] == 1;
|
||||
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
|
||||
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
|
||||
GGML_ASSERT(kernel);
|
||||
if (!kernels->rhs_info.pack_func_ex ||
|
||||
!kernel->get_lhs_offset_ex || !kernel->get_rhs_packed_offset_ex || !kernel->run_kernel_ex) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int nth = params->nth;
|
||||
const int ith = params->ith;
|
||||
|
||||
const int64_t lhs_batch_size0 = ne12;
|
||||
const int64_t rhs_batch_size0 = ne02;
|
||||
const int64_t batch_size = lhs_batch_size0;
|
||||
|
||||
GGML_ASSERT(rhs_batch_size0 > 0);
|
||||
GGML_ASSERT(lhs_batch_size0 % rhs_batch_size0 == 0);
|
||||
const int64_t r = lhs_batch_size0 / rhs_batch_size0;
|
||||
|
||||
const int64_t m_group = ne11;
|
||||
const int64_t m = m_group;
|
||||
const int64_t n = ne01;
|
||||
const int64_t k = ne00;
|
||||
|
||||
const size_t lhs_stride = src1->nb[1];
|
||||
const size_t rhs_stride = src0->nb[1];
|
||||
const size_t dst_stride = dst->nb[1];
|
||||
|
||||
const int64_t mr = (int64_t) kernel->get_mr();
|
||||
const int64_t nr = (int64_t) kernel->get_nr();
|
||||
const int64_t kr = (int64_t) kernel->get_kr();
|
||||
const int64_t sr = (int64_t) kernel->get_sr();
|
||||
|
||||
const size_t lhs_packed_size = lhs_info->packed_size_ex(m, k, 0, mr, kr, sr);
|
||||
const size_t rhs_packed_size = kernels->rhs_info.packed_size_ex(n, k, nr, kr, 0);
|
||||
const size_t kxn_size = k * n * sizeof(float);
|
||||
const size_t bias_size = n * sizeof(float);
|
||||
|
||||
const size_t wsize_required = lhs_packed_size + rhs_packed_size + kxn_size + bias_size;
|
||||
GGML_ASSERT(wsize_required <= params->wsize);
|
||||
|
||||
uint8_t * lhs_packed = static_cast<uint8_t *>(params->wdata);
|
||||
uint8_t * rhs_packed = lhs_packed + lhs_packed_size;
|
||||
uint8_t * rhs_kxn = rhs_packed + rhs_packed_size;
|
||||
uint8_t * bias = rhs_kxn + kxn_size;
|
||||
|
||||
for (int64_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
|
||||
const int64_t rhs_batch_idx = batch_idx / r;
|
||||
const uint8_t * rhs_batch_base = static_cast<const uint8_t *>(src0->data) + rhs_batch_idx * src0->nb[2];
|
||||
uint8_t * dst_batch_base = static_cast<uint8_t *>(dst->data) + batch_idx * dst->nb[2];
|
||||
|
||||
// LHS packing (threaded over m, honoring mr alignment and KV groups)
|
||||
{
|
||||
const int64_t m_roundup_mr = kai_roundup(m, mr);
|
||||
const int64_t num_threads = KAI_MIN(m_roundup_mr / mr, nth);
|
||||
|
||||
if (ith < num_threads) {
|
||||
const int64_t num_m_per_thread0 = round_down((size_t)(m_roundup_mr / num_threads), (size_t)mr);
|
||||
const int64_t num_m_per_threadN_1 = m - (num_threads - 1) * num_m_per_thread0;
|
||||
|
||||
const int64_t m_start = ith * num_m_per_thread0;
|
||||
const int64_t m_count = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;
|
||||
|
||||
// Base packed offset (aligned) and per-row stride in bytes
|
||||
const size_t base_packed_off = lhs_info->get_packed_offset_ex(m_start, k, 0, mr, kr, sr);
|
||||
const size_t next_block_off = lhs_info->get_packed_offset_ex(m_start + mr, k, 0, mr, kr, sr);
|
||||
const size_t row_stride_bytes = (next_block_off - base_packed_off) / (size_t)mr;
|
||||
|
||||
int64_t remaining = m_count;
|
||||
int64_t cur = m_start;
|
||||
|
||||
while (remaining > 0) {
|
||||
const int64_t row_in_group = cur;
|
||||
const int64_t avail = m_group - row_in_group;
|
||||
const int64_t take = std::min(avail, remaining);
|
||||
|
||||
const uint8_t * lhs_batch_base = static_cast<const uint8_t *>(src1->data) + batch_idx * src1->nb[2];
|
||||
const void * src_ptr = lhs_batch_base + (size_t)row_in_group * lhs_stride;
|
||||
const size_t dst_off = base_packed_off + (size_t)(cur - m_start) * row_stride_bytes;
|
||||
void * dst_ptr = lhs_packed + dst_off;
|
||||
|
||||
lhs_info->pack_func_ex(take, k, 0, mr, kr, sr, 0, src_ptr, lhs_stride, dst_ptr);
|
||||
|
||||
cur += take;
|
||||
remaining -= take;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RHS packing (single thread), then synchronize
|
||||
if (ith == 0) {
|
||||
memset(bias, 0, (size_t)n * sizeof(float));
|
||||
transpose_f32kxn_f16nxk((size_t)n, (size_t)k,
|
||||
reinterpret_cast<float *>(rhs_kxn),
|
||||
reinterpret_cast<const uint16_t *>(rhs_batch_base),
|
||||
rhs_stride);
|
||||
|
||||
kernels->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, 0, n * sizeof(float),
|
||||
rhs_kxn, bias, nullptr, rhs_packed, 0, nullptr);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
// Matmul (threaded over n)
|
||||
{
|
||||
const int64_t n_step = (int64_t) kernel->get_n_step();
|
||||
int64_t num_threads_n = KAI_MIN(n / n_step, nth);
|
||||
if (num_threads_n <= 0) {
|
||||
num_threads_n = 1;
|
||||
}
|
||||
|
||||
if (ith < num_threads_n) {
|
||||
const int64_t num_n_per_thread0 = round_down((size_t)(n / num_threads_n), (size_t)n_step);
|
||||
const int64_t num_n_per_threadN_1 = n - (num_threads_n - 1) * num_n_per_thread0;
|
||||
|
||||
const int64_t n_start = ith * num_n_per_thread0;
|
||||
const int64_t n_to_process = (ith == num_threads_n - 1) ? num_n_per_threadN_1 : num_n_per_thread0;
|
||||
|
||||
// LHS packed base at row 0 (consistent with packing above)
|
||||
const size_t lhs_packed_offset0 = lhs_info->get_packed_offset_ex(0, k, 0, mr, kr, sr);
|
||||
const size_t rhs_packed_offset = kernel->get_rhs_packed_offset_ex(n_start, k, 0);
|
||||
const size_t dst_offset = kernel->get_dst_offset((size_t)0, (size_t)n_start, dst_stride);
|
||||
|
||||
const void * lhs_ptr = lhs_packed + lhs_packed_offset0;
|
||||
const void * rhs_ptr = rhs_packed + rhs_packed_offset;
|
||||
float * dst_ptr = reinterpret_cast<float *>(dst_batch_base + dst_offset);
|
||||
|
||||
kernel->run_kernel_ex(m, n_to_process, k, 0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
}
|
||||
}
|
||||
|
||||
if (batch_idx != batch_size - 1) {
|
||||
ggml_barrier(params->threadpool);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool compute_forward_q4_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0);
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
|
||||
if (!kernels) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_gemv = src1->ne[1] == 1;
|
||||
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
|
||||
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
|
||||
|
||||
GGML_ASSERT(kernel);
|
||||
if (!lhs_info->get_packed_offset_ex || !lhs_info->pack_func_ex ||
|
||||
!kernel->get_rhs_packed_offset_ex || !kernel->run_kernel_ex || !kernel->get_dst_offset) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth_raw = params->nth;
|
||||
const int nth = nth_raw > 0 ? nth_raw : 1;
|
||||
|
||||
const size_t k = ne00;
|
||||
const size_t m = ne11;
|
||||
const size_t n = ne01;
|
||||
|
||||
size_t mr = kernel->get_mr();
|
||||
size_t kr = kernel->get_kr();
|
||||
size_t sr = kernel->get_sr();
|
||||
|
||||
const uint8_t * lhs = static_cast<const uint8_t *>(src1->data);
|
||||
uint8_t * lhs_packed = (uint8_t*)params->wdata;
|
||||
const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
|
||||
|
||||
const size_t n_step = kernel->get_n_step();
|
||||
const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
|
||||
const size_t n_start = ith * num_n_per_thread;
|
||||
|
||||
size_t n_to_process = 0;
|
||||
if (n_start < n) {
|
||||
n_to_process = num_n_per_thread;
|
||||
if ((n_start + n_to_process) > n) {
|
||||
n_to_process = n - n_start;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate number of columns to be processed per thread
|
||||
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
|
||||
const size_t m_start = ith * num_m_per_thread;
|
||||
size_t m_to_process = num_m_per_thread;
|
||||
if ((m_start + m_to_process) > m) {
|
||||
m_to_process = m - m_start;
|
||||
}
|
||||
|
||||
if (m_start < m) {
|
||||
// Transform LHS
|
||||
const size_t src_stride = src1->nb[1];
|
||||
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
|
||||
const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(m_start, k, QK4_0, mr, kr, sr);
|
||||
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
|
||||
|
||||
// Pack this thread's chunk with m_idx_start = 0 and per-thread output pointer
|
||||
lhs_info->pack_func_ex(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
// Perform the operation
|
||||
const size_t dst_stride = dst->nb[1];
|
||||
const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(0, k, QK4_0, mr, kr, sr);
|
||||
const size_t rhs_packed_offset = kernel->get_rhs_packed_offset_ex(n_start, k, QK4_0);
|
||||
const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
|
||||
const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
|
||||
const void* lhs_ptr = (const void*)((const char *)lhs_packed + lhs_packed_offset);
|
||||
float *dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
|
||||
|
||||
if (n_to_process > 0) {
|
||||
kernel->run_kernel_ex(m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
|
||||
sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool compute_forward_q8_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q8_0);
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
|
||||
if (!kernels) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_gemv = src1->ne[1] == 1;
|
||||
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
|
||||
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
|
||||
|
||||
if (!kernel || !lhs_info->get_packed_offset_ex || !lhs_info->pack_func_ex ||
|
||||
!kernel->get_rhs_packed_offset_ex || !kernel->run_kernel_ex || !kernel->get_dst_offset) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth_raw = params->nth;
|
||||
const int nth = nth_raw > 0 ? nth_raw : 1;
|
||||
|
||||
const size_t k = ne00;
|
||||
const size_t m = ne11;
|
||||
const size_t n = ne01;
|
||||
|
||||
size_t mr = kernel->get_mr();
|
||||
size_t kr = kernel->get_kr();
|
||||
size_t sr = kernel->get_sr();
|
||||
|
||||
const uint8_t * lhs = static_cast<const uint8_t *>(src1->data);
|
||||
uint8_t * lhs_packed = static_cast<uint8_t *>(params->wdata);
|
||||
const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
|
||||
|
||||
const size_t n_step = kernel->get_n_step();
|
||||
const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
|
||||
const size_t n_start = ith * num_n_per_thread;
|
||||
|
||||
size_t n_to_process = 0;
|
||||
if (n_start < n) {
|
||||
n_to_process = num_n_per_thread;
|
||||
if ((n_start + n_to_process) > n) {
|
||||
n_to_process = n - n_start;
|
||||
}
|
||||
}
|
||||
|
||||
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
|
||||
const size_t m_start = ith * num_m_per_thread;
|
||||
size_t m_to_process = num_m_per_thread;
|
||||
if ((m_start + m_to_process) > m) {
|
||||
m_to_process = m - m_start;
|
||||
}
|
||||
|
||||
if (m_start < m) {
|
||||
const size_t src_stride = src1->nb[1];
|
||||
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
|
||||
const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(m_start, k, 0, mr, kr, sr);
|
||||
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
|
||||
|
||||
lhs_info->pack_func_ex(m_to_process, k, 0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
const size_t dst_stride = dst->nb[1];
|
||||
const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(0, k, 0, mr, kr, sr);
|
||||
const size_t rhs_packed_offset = kernel->get_rhs_packed_offset_ex(n_start, k, 0);
|
||||
const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
|
||||
const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
|
||||
const void * lhs_ptr = static_cast<const void *>(lhs_packed + lhs_packed_offset);
|
||||
float * dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
|
||||
|
||||
if (n_to_process > 0) {
|
||||
kernel->run_kernel_ex(m, n_to_process, k, 0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
|
||||
sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
ggml_kleidiai_kernels * kernels = nullptr;
|
||||
size_t block_len = 0;
|
||||
size_t num_bytes_multiplier = 0;
|
||||
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
if (!ctx.kernels_q4) {
|
||||
return false;
|
||||
}
|
||||
kernels = ctx.kernels_q4;
|
||||
block_len = QK4_0;
|
||||
num_bytes_multiplier = sizeof(uint16_t);
|
||||
} else if (dst->src[0]->type == GGML_TYPE_Q8_0) {
|
||||
if (!ctx.kernels_q8) {
|
||||
return false;
|
||||
}
|
||||
kernels = ctx.kernels_q8;
|
||||
block_len = QK8_0;
|
||||
num_bytes_multiplier = sizeof(float);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
rhs_packing_info * rhs_info = &kernels->rhs_info;
|
||||
kernel_info * kernel = &kernels->gemm;
|
||||
if (!rhs_info->to_float || !kernel->get_nr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int64_t nc = ne00;
|
||||
const int64_t nr = ggml_nelements(src1);
|
||||
|
||||
const size_t block_rows = kernel->get_nr();
|
||||
const size_t kr = kernel->get_kr();
|
||||
|
||||
const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, block_len);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int dr = (nr + nth - 1) / nth;
|
||||
const int ir0 = dr * ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int64_t i = ir0; i < ir1; ++i) {
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
int64_t row_idx = ((const int32_t *)src1->data)[i];
|
||||
GGML_ASSERT(row_idx >= 0 && row_idx < src0->ne[1]);
|
||||
|
||||
float *out = (float *)((char *)dst->data + i * nb1);
|
||||
rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, block_len, num_bytes_multiplier);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
|
||||
const size_t n = tensor->ne[1];
|
||||
const size_t k = tensor->ne[0];
|
||||
|
||||
if (tensor->type == GGML_TYPE_Q4_0) {
|
||||
if (!ctx.kernels_q4) {
|
||||
return -1;
|
||||
}
|
||||
size_t nr = ctx.kernels_q4->gemm.get_nr();
|
||||
size_t kr = ctx.kernels_q4->gemm.get_kr();
|
||||
size_t sr = ctx.kernels_q4->gemm.get_sr();
|
||||
|
||||
struct kai_rhs_pack_qs4cxs1s0_param params;
|
||||
params.lhs_zero_point = 1;
|
||||
params.rhs_zero_point = 8;
|
||||
ctx.kernels_q4->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, QK4_0, 0,
|
||||
static_cast<const uint8_t *>(data),
|
||||
nullptr, nullptr, tensor->data, 0, ¶ms);
|
||||
GGML_UNUSED(data_size);
|
||||
return 0;
|
||||
} else if (tensor->type == GGML_TYPE_Q8_0) {
|
||||
if (!ctx.kernels_q8) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
const size_t row_stride = tensor->nb[1];
|
||||
const size_t k_blocks = (k + QK8_0 - 1) / QK8_0;
|
||||
|
||||
std::vector<int8_t> qdata(n * k, 0);
|
||||
std::vector<float> scales(n, 0.0f);
|
||||
|
||||
for (size_t row = 0; row < n; ++row) {
|
||||
const auto * row_blocks = reinterpret_cast<const block_q8_0 *>(
|
||||
static_cast<const uint8_t *>(data) + row * row_stride);
|
||||
|
||||
float max_abs = 0.0f;
|
||||
for (size_t block = 0; block < k_blocks; ++block) {
|
||||
const block_q8_0 & blk = row_blocks[block];
|
||||
const float d = GGML_FP16_TO_FP32(blk.d);
|
||||
for (size_t l = 0; l < QK8_0; ++l) {
|
||||
const size_t linear_idx = block * QK8_0 + l;
|
||||
if (linear_idx >= k) {
|
||||
break;
|
||||
}
|
||||
const float value = d * blk.qs[l];
|
||||
max_abs = std::max(max_abs, std::fabs(value));
|
||||
}
|
||||
}
|
||||
|
||||
float scale = max_abs > 0.0f ? max_abs / 127.0f : 0.0f;
|
||||
scales[row] = scale;
|
||||
const float inv_scale = scale > 0.0f ? 1.0f / scale : 0.0f;
|
||||
|
||||
for (size_t block = 0; block < k_blocks; ++block) {
|
||||
const block_q8_0 & blk = row_blocks[block];
|
||||
const float d = GGML_FP16_TO_FP32(blk.d);
|
||||
for (size_t l = 0; l < QK8_0; ++l) {
|
||||
const size_t linear_idx = block * QK8_0 + l;
|
||||
if (linear_idx >= k) {
|
||||
break;
|
||||
}
|
||||
const float value = d * blk.qs[l];
|
||||
int32_t q = scale > 0.0f ? static_cast<int32_t>(std::lround(value * inv_scale)) : 0;
|
||||
q = std::clamp(q, -127, 127);
|
||||
qdata[row * k + linear_idx] = static_cast<int8_t>(q);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t nr = ctx.kernels_q8->gemm.get_nr();
|
||||
size_t kr = ctx.kernels_q8->gemm.get_kr();
|
||||
size_t sr = ctx.kernels_q8->gemm.get_sr();
|
||||
|
||||
struct kai_rhs_pack_qsi8cx_params params;
|
||||
params.lhs_zero_point = 1;
|
||||
params.scale_multiplier = 1.0f;
|
||||
|
||||
ctx.kernels_q8->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, 0, 0,
|
||||
qdata.data(), nullptr, scales.data(),
|
||||
tensor->data, 0, ¶ms);
|
||||
GGML_UNUSED(data_size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
GGML_UNUSED(data_size);
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
|
||||
static tensor_traits traits;
|
||||
return &traits;
|
||||
}
|
||||
} // namespace ggml::cpu::kleidiai
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor);
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_kleidiai_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
|
||||
const void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
|
||||
auto tensor_traits = (ggml::cpu::kleidiai::tensor_traits *) tensor->extra;
|
||||
auto OK = tensor_traits->repack(tensor, data, size);
|
||||
|
||||
GGML_ASSERT(OK == 0);
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cpu_kleidiai_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU_KLEIDIAI";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
|
||||
if (buffer == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
buffer->buft = buft;
|
||||
buffer->iface.init_tensor = ggml_backend_cpu_kleidiai_buffer_init_tensor;
|
||||
buffer->iface.set_tensor = ggml_backend_cpu_kleidiai_buffer_set_tensor;
|
||||
buffer->iface.get_tensor = nullptr;
|
||||
buffer->iface.cpy_tensor = nullptr;
|
||||
return buffer;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return TENSOR_ALIGNMENT;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
|
||||
GGML_UNUSED(buft);
|
||||
|
||||
const size_t n = tensor->ne[1];
|
||||
const size_t k = tensor->ne[0];
|
||||
|
||||
ggml_kleidiai_kernels * kernels = nullptr;
|
||||
size_t block_len = 0;
|
||||
|
||||
if (tensor->type == GGML_TYPE_Q4_0) {
|
||||
GGML_ASSERT(ctx.kernels_q4);
|
||||
kernels = ctx.kernels_q4;
|
||||
block_len = QK4_0;
|
||||
} else if (tensor->type == GGML_TYPE_Q8_0) {
|
||||
GGML_ASSERT(ctx.kernels_q8);
|
||||
kernels = ctx.kernels_q8;
|
||||
block_len = QK8_0;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const size_t nr = kernels->gemm.get_nr();
|
||||
const size_t kr = kernels->gemm.get_kr();
|
||||
const size_t packed = kernels->rhs_info.packed_size_ex(n, k, nr, kr, block_len);
|
||||
const size_t raw = ggml_nbytes(tensor);
|
||||
|
||||
return packed > raw ? packed : raw;
|
||||
}
|
||||
|
||||
namespace ggml::cpu::kleidiai {
|
||||
class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
|
||||
if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) &&
|
||||
(op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q8_0) &&
|
||||
op->src[0]->buffer &&
|
||||
(ggml_n_dims(op->src[0]) == 2) &&
|
||||
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
if (((op->src[0]->type == GGML_TYPE_Q4_0) ? ctx.kernels_q4 : ctx.kernels_q8) == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
|
||||
return false;
|
||||
}
|
||||
if ((op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_I32) &&
|
||||
ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
|
||||
if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) {
|
||||
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
|
||||
}
|
||||
else if (ggml_kleidiai_select_kernels(ctx.features, op) && op->src[1]->ne[1] > 1) {
|
||||
if ((op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
|
||||
(op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return ggml::cpu::kleidiai::get_tensor_traits(NULL, NULL);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
} // namespace ggml::cpu::kleidiai
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void) {
|
||||
static ggml::cpu::kleidiai::extra_buffer_type ctx;
|
||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_kleidiai = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cpu_kleidiai_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_kleidiai_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ nullptr,
|
||||
},
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ &ctx,
|
||||
};
|
||||
|
||||
init_kleidiai_context();
|
||||
|
||||
return &ggml_backend_cpu_buffer_type_kleidiai;
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml-alloc.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,333 @@
|
||||
#pragma once
|
||||
|
||||
typedef vector unsigned char vec_t;
|
||||
typedef __vector_quad acc_t;
|
||||
|
||||
template <typename TA>
|
||||
class tinyBLAS_Q0_PPC {
|
||||
public:
|
||||
tinyBLAS_Q0_PPC(int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const block_q8_0 *B, int64_t ldb,
|
||||
float *C, int64_t ldc,
|
||||
int ith, int nth);
|
||||
|
||||
void matmul(int64_t m, int64_t n);
|
||||
void matmul_tiled_q0(int64_t m, int64_t n, int64_t mc, int64_t nc, int64_t kc) {
|
||||
vec_t A_pack[mc*kc*2];
|
||||
vec_t B_pack[nc*kc*2];
|
||||
int comparray[mc*kc];
|
||||
constexpr bool is_Ablock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||
int64_t ytiles = m / mc;
|
||||
int64_t xtiles = n / nc;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles) {
|
||||
end = tiles;
|
||||
}
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = (job / xtiles) * mc;
|
||||
int64_t jj = (job % xtiles) * nc;
|
||||
for (int64_t kk = 0; kk < k; kk += kc) {
|
||||
if constexpr(is_Ablock_q4) {
|
||||
packNormalInt4_large(A + ii*lda + kk, lda, mc, 4, (int8_t*)A_pack, comparray);
|
||||
} else {
|
||||
packNormal_large<int8_t, vector signed char>(A + ii*lda + kk, lda, mc, 8, (int8_t*)A_pack, false, comparray);
|
||||
}
|
||||
packNormal_large<uint8_t, vector unsigned char>(B + jj*ldb + kk, ldb, nc, 8, (uint8_t*)B_pack, true);
|
||||
KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack, comparray);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void add_save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
float * c_ptr = (float *)(C+ii+((jj+J)*ldc)+I);
|
||||
*c_ptr += *((float*)&fin_res[idx+I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename ArrayType>
|
||||
inline void compute(acc_t* ACC, int c_idx, int s_idx, ArrayType& comparray, vector float* vs, vector float* fin_res) {
|
||||
vector signed int vec_C[4];
|
||||
vector float CA[4] = {0};
|
||||
vector float res[4] = {0};
|
||||
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
|
||||
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
|
||||
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
|
||||
const vector signed char lowMask = vec_splats((signed char)0xF);
|
||||
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
||||
const vector signed char v8 = vec_splats((signed char)0x8);
|
||||
vector signed int vsum = {0};
|
||||
vector signed int vsum2 = {0};
|
||||
c[0] = vec_and(c[1], lowMask);
|
||||
c[1] = vec_sr(c[1], v4);
|
||||
c[0] = vec_sub(c[0], v8);
|
||||
c[1] = vec_sub(c[1], v8);
|
||||
vsum = vec_sum4s(c[0], vsum);
|
||||
vsum2 = vec_sum4s(c[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
*(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
}
|
||||
|
||||
template <typename V1, typename V2>
|
||||
inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
|
||||
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
|
||||
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
|
||||
V2 t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
vector unsigned char xor_vector;
|
||||
uint8_t flip_vec = 0x80;
|
||||
xor_vector = vec_splats(flip_vec);
|
||||
t1 = vec_perm(s1, s2, swiz1);
|
||||
t2 = vec_perm(s1, s2, swiz2);
|
||||
t3 = vec_perm(s3, s4, swiz1);
|
||||
t4 = vec_perm(s3, s4, swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
}
|
||||
|
||||
template<int RM, int RN>
|
||||
inline void kernel(int64_t ii, int64_t jj) {
|
||||
if constexpr(RM == 4 && RN == 8) {
|
||||
KERNEL_4x8(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 4) {
|
||||
KERNEL_8x4(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 8) {
|
||||
KERNEL_8x8(ii,jj);
|
||||
} else {
|
||||
assert(false && "RN/RM values not supported");
|
||||
}
|
||||
}
|
||||
template<int size>
|
||||
void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray);
|
||||
template<typename VA, typename VB>
|
||||
void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip);
|
||||
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n);
|
||||
void KERNEL_4x8(int64_t ii, int64_t jj);
|
||||
void KERNEL_8x4(int64_t ii, int64_t jj);
|
||||
void KERNEL_8x8(int64_t ii, int64_t jj);
|
||||
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN);
|
||||
template <int RM, int RN>
|
||||
void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n);
|
||||
|
||||
void compute_scale(int64_t ii, int64_t jj, int blk, vector float* vs){
|
||||
for (int I = 0; I<8; I++) {
|
||||
float a_scale = unhalf((A+((ii+I)*lda)+blk)->d);
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (a_scale * unhalf((B+((jj+J)*ldb)+blk)->d));
|
||||
*((float*)&vs[I+8]+J) = (a_scale * unhalf((B+((jj+J+4)*ldb)+blk)->d));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void process_q8_elements(const int8_t *qs, int *ca) {
|
||||
vector signed char c1 = vec_xl(0, qs);
|
||||
vector signed char c2 = vec_xl(16, qs);
|
||||
vector signed int vsum1 = {0};
|
||||
vector signed int vsum2 = {0};
|
||||
vsum1 = vec_sum4s(c1, vsum1);
|
||||
vsum2 = vec_sum4s(c2, vsum2);
|
||||
vector signed int vsum = vec_add(vsum1, vsum2);
|
||||
*ca = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
}
|
||||
|
||||
template<typename VA, typename VB>
|
||||
void packNormal_large(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip, int* comparray=nullptr) {
|
||||
int64_t i, j;
|
||||
block_q8_0 *aoffset = NULL;
|
||||
VA *vecOffset = NULL;
|
||||
block_q8_0* aoffsets[8];
|
||||
__vector_pair arr[8];
|
||||
VB c[8][2] = {0};
|
||||
VB c1[8] = {0}; VB c2[8] = {0};
|
||||
aoffset = const_cast<block_q8_0*>(a);
|
||||
vecOffset = vec;
|
||||
j = (rows >> 3);
|
||||
int index = 0;
|
||||
if (j > 0) {
|
||||
do {
|
||||
for (int it = 0; it < 8; it++)
|
||||
aoffsets[it] = aoffset + it*lda;
|
||||
aoffset += 8 * lda;
|
||||
for (int blk = 0; blk < kc; blk++) {
|
||||
for (int it = 0; it < 8; it++) {
|
||||
arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)(aoffsets[it]+blk)->qs);
|
||||
__builtin_vsx_disassemble_pair(c[it], &arr[it]);
|
||||
c1[it] = c[it][0];
|
||||
c2[it] = c[it][1];
|
||||
if (comparray){
|
||||
process_q8_elements((aoffsets[it]+ blk)->qs, &comparray[index + 8*blk + it]);
|
||||
}
|
||||
}
|
||||
vector_permute_store<VA, VB>(c1[0], c1[1], c1[2], c1[3], vecOffset, flip);
|
||||
vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip);
|
||||
vector_permute_store<VA, VB>(c1[4], c1[5], c1[6], c1[7], vecOffset+128, flip);
|
||||
vector_permute_store<VA, VB>(c2[4], c2[5], c2[6], c2[7], vecOffset+192, flip);
|
||||
vecOffset += 256;
|
||||
}
|
||||
j--;
|
||||
index += 8*kc;
|
||||
} while(j > 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void packNormalInt4_large(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, int*comparray) {
|
||||
int64_t i, j;
|
||||
TA *aoffset = NULL;
|
||||
int8_t *vecOffset = NULL;
|
||||
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
vector signed char c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
|
||||
vector signed char c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
|
||||
aoffset = const_cast<TA*>(a);
|
||||
vecOffset = vec;
|
||||
int index = 0;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
do {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset5 = aoffset4 + lda;
|
||||
aoffset6 = aoffset5 + lda;
|
||||
aoffset7 = aoffset6 + lda;
|
||||
aoffset8 = aoffset7 + lda;
|
||||
aoffset += 8 * lda;
|
||||
for (int blk = 0; blk < kc; blk++) {
|
||||
c1[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset1+blk)->qs));
|
||||
c2[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset2+blk)->qs));
|
||||
c3[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset3+blk)->qs));
|
||||
c4[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset4+blk)->qs));
|
||||
c5[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset5+blk)->qs));
|
||||
c6[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset6+blk)->qs));
|
||||
c7[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset7+blk)->qs));
|
||||
c8[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset8+blk)->qs));
|
||||
|
||||
process_q4_elements(c1, &comparray[index + 8*blk+0]);
|
||||
process_q4_elements(c2, &comparray[index + 8*blk+1]);
|
||||
process_q4_elements(c3, &comparray[index + 8*blk+2]);
|
||||
process_q4_elements(c4, &comparray[index + 8*blk+3]);
|
||||
process_q4_elements(c5, &comparray[index + 8*blk+4]);
|
||||
process_q4_elements(c6, &comparray[index + 8*blk+5]);
|
||||
process_q4_elements(c7, &comparray[index + 8*blk+6]);
|
||||
process_q4_elements(c8, &comparray[index + 8*blk+7]);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[0], c2[0], c3[0], c4[0], vecOffset, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c5[0], c6[0], c7[0], c8[0], vecOffset+128, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c5[1], c6[1], c7[1], c8[1], vecOffset+192, false);
|
||||
vecOffset += 256;
|
||||
}
|
||||
j--;
|
||||
index += 8*kc;
|
||||
} while (j > 0);
|
||||
}
|
||||
}
|
||||
|
||||
void KERNEL_Q0(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, int64_t l, vec_t *vec_A, vec_t *vec_B, int *comparray) {
|
||||
acc_t acc[8];
|
||||
for (int i = 0; i < mc ; i += 8) {
|
||||
for (int j = 0; j < nc; j += 8) {
|
||||
vector float fin_res[16] = {0};
|
||||
vector float vs[16] = {0};
|
||||
for (int64_t kk = 0; kk < kc; kk+=2) {
|
||||
for (int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xxsetaccz(&acc[x]);
|
||||
}
|
||||
int A_block_idx = (i/8)*(16*kc) + kk*16;
|
||||
int B_block_idx = (j/8)*(16*kc)+ kk*16;
|
||||
vec_t *A_block = &vec_A[A_block_idx];
|
||||
vec_t *B_block = &vec_B[B_block_idx];
|
||||
for (int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc[0], A_block[x], B_block[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[1], A_block[x + 8], B_block[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[2], A_block[x], B_block[x+8]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[3], A_block[x+8], B_block[x+8]);
|
||||
}
|
||||
compute_scale(ii+i, jj+j, l+kk, vs);
|
||||
int c_index = (i/8)*(8*kc)+ kk*8;
|
||||
int* c_block = &comparray[c_index];
|
||||
compute(&acc[0], 0, 0, c_block, vs, fin_res);
|
||||
compute(&acc[1], 4, 4, c_block, vs, fin_res);
|
||||
compute(&acc[2], 0, 8, c_block, vs, fin_res);
|
||||
compute(&acc[3], 4, 12, c_block, vs, fin_res);
|
||||
|
||||
A_block_idx = (i/8)*(16*kc) + (kk+1)*16;
|
||||
B_block_idx = (j/8)*(16*kc)+ (kk+1)*16;
|
||||
A_block = &vec_A[A_block_idx];
|
||||
B_block = &vec_B[B_block_idx];
|
||||
for (int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc[4], A_block[x], B_block[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[5], A_block[x + 8], B_block[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[6], A_block[x], B_block[x+8]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[7], A_block[x+8], B_block[x+8]);
|
||||
}
|
||||
compute_scale(ii+i, jj+j, l+kk+1, vs);
|
||||
c_index = (i/8)*(8*kc)+ (kk+1)*8;
|
||||
c_block = &comparray[c_index];
|
||||
compute(&acc[4], 0, 0, c_block, vs, fin_res);
|
||||
compute(&acc[5], 4, 4, c_block, vs, fin_res);
|
||||
compute(&acc[6], 0, 8, c_block, vs, fin_res);
|
||||
compute(&acc[7], 4, 12, c_block, vs, fin_res);
|
||||
|
||||
}
|
||||
if (l == 0) {
|
||||
save_res(ii+i, jj+j, 0, fin_res);
|
||||
save_res(ii+i+4, jj+j, 4, fin_res);
|
||||
save_res(ii+i, jj+j+4, 8, fin_res);
|
||||
save_res(ii+i+4, jj+j+4, 12, fin_res);
|
||||
} else {
|
||||
add_save_res(ii+i, jj+j, 0, fin_res);
|
||||
add_save_res(ii+i+4, jj+j, 4, fin_res);
|
||||
add_save_res(ii+i, jj+j+4, 8, fin_res);
|
||||
add_save_res(ii+i+4, jj+j+4, 12, fin_res);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const TA *const A;
|
||||
const block_q8_0 *const B;
|
||||
float *C;
|
||||
const int64_t k;
|
||||
int64_t kc;
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
const int ith;
|
||||
const int nth;
|
||||
};
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,25 @@
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#if defined(__VXE__) || defined(__VXE2__)
|
||||
#include <vecintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define NOINLINE __declspec(noinline)
|
||||
#else
|
||||
#define NOINLINE __attribute__((__noinline__))
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
|
||||
const void *, int64_t, const void *, int64_t, void *, int64_t,
|
||||
int, int, int);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,116 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
//
|
||||
// cache line
|
||||
//
|
||||
|
||||
#if defined(__cpp_lib_hardware_interference_size)
|
||||
#define CACHE_LINE_SIZE std::hardware_destructive_interference_size
|
||||
#else
|
||||
#if defined(__POWER9_VECTOR__)
|
||||
#define CACHE_LINE_SIZE 128
|
||||
#elif defined(__VXE__) || defined(__VXE2__)
|
||||
#define CACHE_LINE_SIZE 256
|
||||
#else
|
||||
#define CACHE_LINE_SIZE 64
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
||||
|
||||
// Work buffer size for im2col operations in CONV2D
|
||||
#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_add(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_add_id(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cumsum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_repeat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_repeat_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_concat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_group_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_l2_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_out_prod(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_scale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_soft_max(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_soft_max_ext_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rope(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rope_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_roll(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_top_k(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_fill(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_flash_attn_back(
|
||||
const struct ggml_compute_params * params,
|
||||
const bool masked,
|
||||
struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_glu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_custom(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,97 @@
|
||||
#pragma once
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML CPU internal header
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
// Generic implementation
|
||||
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,134 @@
|
||||
#pragma once
|
||||
|
||||
#define GGML_COMMON_DECL_CPP
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "traits.h"
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void);
|
||||
|
||||
template <int K> constexpr int QK_0() {
|
||||
if constexpr (K == 4) {
|
||||
return QK4_0;
|
||||
}
|
||||
if constexpr (K == 8) {
|
||||
return QK8_0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
template <int K, int N> struct block {
|
||||
ggml_half d[N]; // deltas for N qK_0 blocks
|
||||
int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
|
||||
};
|
||||
|
||||
// control size
|
||||
static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
|
||||
static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
|
||||
static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
|
||||
static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
|
||||
|
||||
using block_q4_0x4 = block<4, 4>;
|
||||
using block_q4_0x8 = block<4, 8>;
|
||||
using block_q8_0x4 = block<8, 4>;
|
||||
using block_q8_0x8 = block<8, 8>;
|
||||
|
||||
struct block_q4_Kx8 {
|
||||
ggml_half d[8]; // super-block scale for quantized scales
|
||||
ggml_half dmin[8]; // super-block scale for quantized mins
|
||||
uint8_t scales[96]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qs[1024]; // 4--bit quants
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
|
||||
struct block_q2_Kx8 {
|
||||
ggml_half d[8]; // super-block scale for quantized scales
|
||||
ggml_half dmin[8]; // super-block scale for quantized mins
|
||||
uint8_t scales[128]; // scales and mins, quantized with 4 bits
|
||||
uint8_t qs[512]; // 2--bit quants
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
|
||||
struct block_q8_Kx4 {
|
||||
float d[4]; // delta
|
||||
int8_t qs[QK_K * 4]; // quants
|
||||
int16_t bsums[QK_K / 4]; // sum of quants in groups of 16
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding");
|
||||
|
||||
struct block_iq4_nlx4 {
|
||||
ggml_half d[4]; // deltas for 4 iq4_nl blocks
|
||||
uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
|
||||
|
||||
struct block_iq4_nlx8 {
|
||||
ggml_half d[8]; // deltas for 8 iq4_nl blocks
|
||||
uint8_t qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
// Native implementations
|
||||
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-alloc.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_riscv64_spacemit_buffer_type(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,26 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace sqnbitgemm_spacemit_ime {
|
||||
namespace ime1 {
|
||||
size_t gemm_kernel_i8i4(size_t blk_len,
|
||||
const std::byte * quant_a_ptr,
|
||||
const std::byte * quant_b_data,
|
||||
const float * quant_b_scale,
|
||||
const std::byte * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t count_k,
|
||||
size_t block_count_k,
|
||||
size_t ldc,
|
||||
const float * bias,
|
||||
const size_t scale_stride);
|
||||
|
||||
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
|
||||
|
||||
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
|
||||
|
||||
} // namespace ime1
|
||||
} // namespace sqnbitgemm_spacemit_ime
|
||||
@@ -0,0 +1,36 @@
|
||||
#include "traits.h"
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
namespace ggml::cpu {
|
||||
tensor_traits::~tensor_traits() {}
|
||||
|
||||
extra_buffer_type::~extra_buffer_type() {}
|
||||
} // namespace ggml::cpu
|
||||
|
||||
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
|
||||
for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
|
||||
if (extra && extra->context) {
|
||||
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
|
||||
auto tensor_traits = buf_extra->get_tensor_traits(op);
|
||||
if (tensor_traits && tensor_traits->compute_forward(params, op)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
|
||||
for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
|
||||
if (extra && extra->context) {
|
||||
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
|
||||
auto tensor_traits = buf_extra->get_tensor_traits(op);
|
||||
if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
#pragma once
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
# include <vector>
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// return true if op part of extra "accelerator"
|
||||
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
|
||||
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
namespace ggml::cpu {
|
||||
// register in tensor->extra
|
||||
class tensor_traits {
|
||||
public:
|
||||
virtual ~tensor_traits();
|
||||
virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
|
||||
virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
|
||||
};
|
||||
|
||||
class extra_buffer_type {
|
||||
public:
|
||||
virtual ~extra_buffer_type();
|
||||
virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
|
||||
virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
|
||||
};
|
||||
} // namespace ggml::cpu
|
||||
|
||||
// implemented in ggml-cpu.cpp.
|
||||
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,337 @@
|
||||
#include "unary-ops.h"
|
||||
|
||||
static inline float op_abs(float x) {
|
||||
return fabsf(x);
|
||||
}
|
||||
|
||||
static inline float op_sgn(float x) {
|
||||
return (x > 0.f) ? 1.f : ((x < 0.f) ? -1.f : 0.f);
|
||||
}
|
||||
|
||||
static inline float op_neg(float x) {
|
||||
return -x;
|
||||
}
|
||||
|
||||
static inline float op_step(float x) {
|
||||
return (x > 0.f) ? 1.f : 0.f;
|
||||
}
|
||||
|
||||
static inline float op_tanh(float x) {
|
||||
return tanhf(x);
|
||||
}
|
||||
|
||||
static inline float op_elu(float x) {
|
||||
return (x > 0.f) ? x : expm1f(x);
|
||||
}
|
||||
|
||||
static inline float op_relu(float x) {
|
||||
return (x > 0.f) ? x : 0.f;
|
||||
}
|
||||
|
||||
static inline float op_sigmoid(float x) {
|
||||
return 1.f / (1.f + expf(-x));
|
||||
}
|
||||
|
||||
static inline float op_hardsigmoid(float x) {
|
||||
return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
static inline float op_exp(float x) {
|
||||
return expf(x);
|
||||
}
|
||||
|
||||
static inline float op_hardswish(float x) {
|
||||
return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
static inline float op_sqr(float x) {
|
||||
return x * x;
|
||||
}
|
||||
|
||||
static inline float op_sqrt(float x) {
|
||||
return sqrtf(x);
|
||||
}
|
||||
|
||||
static inline float op_xielu(float x, float alpha_n, float alpha_p, float beta, float eps) {
|
||||
if (x > 0.0f) {
|
||||
return alpha_p * x * x + beta * x;
|
||||
} else {
|
||||
const float min_x_eps = fminf(x, eps);
|
||||
return (expm1f(min_x_eps) - x) * alpha_n + beta * x;
|
||||
}
|
||||
}
|
||||
|
||||
static inline float op_sin(float x) {
|
||||
return sinf(x);
|
||||
}
|
||||
|
||||
static inline float op_cos(float x) {
|
||||
return cosf(x);
|
||||
}
|
||||
|
||||
static inline float op_log(float x) {
|
||||
return logf(x);
|
||||
}
|
||||
|
||||
static inline float op_expm1(float x) {
|
||||
return expf(x) - 1.0f;
|
||||
}
|
||||
|
||||
static inline float op_softplus(float x) {
|
||||
return (x > 20.0f) ? x : logf(1.0f + expf(x));
|
||||
}
|
||||
|
||||
static inline float op_floor(float x) {
|
||||
return floorf(x);
|
||||
}
|
||||
|
||||
static inline float op_ceil(float x) {
|
||||
return ceilf(x);
|
||||
}
|
||||
|
||||
static inline float op_round(float x) {
|
||||
return roundf(x);
|
||||
}
|
||||
|
||||
static inline float op_trunc(float x) {
|
||||
return truncf(x);
|
||||
}
|
||||
|
||||
template <float (*op)(float), typename src0_t, typename dst_t>
|
||||
static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
|
||||
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
|
||||
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
y[i] = f32_to_dst(op(src0_to_f32(x[i])));
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*op)(float), typename src0_t, typename dst_t>
|
||||
static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT( nb0 == sizeof(dst_t));
|
||||
GGML_ASSERT(nb00 == sizeof(src0_t));
|
||||
|
||||
const auto [ir0, ir1] = get_thread_range(params, src0);
|
||||
|
||||
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||
const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
|
||||
vec_unary_op<op>(ne0, dst_ptr, src0_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
|
||||
template <float (*op)(float)>
|
||||
static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
/* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
|
||||
apply_unary_op<op, float, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
|
||||
apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
|
||||
apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
|
||||
apply_unary_op<op, ggml_bf16_t, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
apply_unary_op<op, ggml_fp16_t, float>(params, dst);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*op)(float, ggml_tensor *)>
|
||||
static void unary_op_params(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
/* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
|
||||
apply_unary_op<op, float, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
|
||||
apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
|
||||
apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
|
||||
apply_unary_op<op, ggml_bf16_t, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
apply_unary_op<op, ggml_fp16_t, float>(params, dst);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
// Extend vec_unary_op to support functors
|
||||
template <typename Op, typename src0_t, typename dst_t>
|
||||
static inline void vec_unary_op_functor(int64_t n, dst_t * y, const src0_t * x, Op op) {
|
||||
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
|
||||
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
y[i] = f32_to_dst(op(src0_to_f32(x[i])));
|
||||
}
|
||||
}
|
||||
|
||||
// Extend apply_unary_op to support functors
|
||||
template <typename Op, typename src0_t, typename dst_t>
|
||||
static void apply_unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT( nb0 == sizeof(dst_t));
|
||||
GGML_ASSERT(nb00 == sizeof(src0_t));
|
||||
|
||||
const auto [ir0, ir1] = get_thread_range(params, src0);
|
||||
|
||||
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||
const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
|
||||
vec_unary_op_functor(ne0, dst_ptr, src0_ptr, op);
|
||||
}
|
||||
}
|
||||
|
||||
// Generic dispatcher for functors
|
||||
template <typename Op>
|
||||
static void unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
/* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
|
||||
apply_unary_op_functor<Op, float, float>(params, dst, op);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
|
||||
apply_unary_op_functor<Op, ggml_fp16_t, ggml_fp16_t>(params, dst, op);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
|
||||
apply_unary_op_functor<Op, ggml_bf16_t, ggml_bf16_t>(params, dst, op);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
|
||||
apply_unary_op_functor<Op, ggml_bf16_t, float>(params, dst, op);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
apply_unary_op_functor<Op, ggml_fp16_t, float>(params, dst, op);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_abs>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_sgn>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_neg>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_step>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_tanh>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_elu>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_relu>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_sigmoid>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_hardsigmoid>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_exp>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_hardswish>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_sqr>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_sqrt>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_sin>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_cos>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_log>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_expm1(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_expm1>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_softplus(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_softplus>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_floor>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_ceil(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_ceil>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_round>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_trunc(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_trunc>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const float alpha_n = ggml_get_op_params_f32(dst, 1);
|
||||
const float alpha_p = ggml_get_op_params_f32(dst, 2);
|
||||
const float beta = ggml_get_op_params_f32(dst, 3);
|
||||
const float eps = ggml_get_op_params_f32(dst, 4);
|
||||
|
||||
const auto xielu_op_params = [alpha_n, alpha_p, beta, eps](float f) {
|
||||
return op_xielu(f, alpha_n, alpha_p, beta, eps);
|
||||
};
|
||||
|
||||
unary_op_functor(params, dst, xielu_op_params);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_compute_forward_abs(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sgn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_neg(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_step(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_tanh(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_elu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_hardsigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_exp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_hardswish(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sqr(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_expm1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_softplus(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_trunc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,612 @@
|
||||
#include "vec.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
// precomputed gelu table for f16 (128 KB)
|
||||
ggml_fp16_t ggml_table_gelu_f16[1 << 16];
|
||||
|
||||
// precomputed quick gelu table for f16 (128 KB)
|
||||
ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
|
||||
|
||||
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
GGML_UNUSED(nrc);
|
||||
GGML_UNUSED(bx);
|
||||
GGML_UNUSED(by);
|
||||
GGML_UNUSED(bs);
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
float sumf = 0.0f;
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
|
||||
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
|
||||
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
|
||||
|
||||
const int np = (n & ~(ggml_f32_step - 1));
|
||||
svfloat32_t sum1 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum2 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum3 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum4 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum5 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum6 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum7 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum8 = svdup_n_f32(0.0f);
|
||||
svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
|
||||
svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
|
||||
for (int i = 0; i < np; i += ggml_f32_step) {
|
||||
ax1 = GGML_F32_VEC_LOAD(x + i);
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
|
||||
|
||||
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
|
||||
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
|
||||
sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
|
||||
|
||||
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
|
||||
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
|
||||
sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
|
||||
|
||||
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
|
||||
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
|
||||
sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
|
||||
|
||||
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
|
||||
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
|
||||
sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
|
||||
|
||||
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
|
||||
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
|
||||
sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
|
||||
|
||||
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
|
||||
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
|
||||
sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
|
||||
|
||||
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
|
||||
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
|
||||
sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
|
||||
}
|
||||
// leftovers
|
||||
// Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
|
||||
const int np2 = (n & ~(ggml_f32_epr - 1));
|
||||
for (int i = np; i < np2; i += ggml_f32_epr) {
|
||||
ax1 = GGML_F32_VEC_LOAD(x + i);
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
|
||||
}
|
||||
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b32(np2, n);
|
||||
ax1 = svld1_f32(pg, x + np2);
|
||||
ay1 = svld1_f32(pg, y + np2);
|
||||
sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
|
||||
}
|
||||
// reduce sum1,sum2 to sum1
|
||||
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
int vl = __riscv_vsetvlmax_e32m8();
|
||||
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
|
||||
vfloat32m8_t vsum;
|
||||
vfloat32m8_t ax;
|
||||
vfloat32m8_t ay;
|
||||
vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
|
||||
for (int i = 0; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e32m8(n - i);
|
||||
ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
|
||||
ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
|
||||
vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
|
||||
}
|
||||
vl = __riscv_vsetvlmax_e32m8();
|
||||
vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
|
||||
sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
|
||||
#else
|
||||
const int np = (n & ~(GGML_F32_STEP - 1));
|
||||
|
||||
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
||||
|
||||
GGML_F32_VEC ax[GGML_F32_ARR];
|
||||
GGML_F32_VEC ay[GGML_F32_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
||||
for (int j = 0; j < GGML_F32_ARR; j++) {
|
||||
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
|
||||
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
|
||||
|
||||
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// reduce sum0..sum3 to sum0
|
||||
GGML_F32_VEC_REDUCE(sumf, sum);
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
sumf += x[i]*y[i];
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
// scalar
|
||||
ggml_float sumf = 0.0;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
sumf += (ggml_float)(x[i]*y[i]);
|
||||
}
|
||||
#endif
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
GGML_UNUSED(nrc);
|
||||
GGML_UNUSED(bx);
|
||||
GGML_UNUSED(by);
|
||||
GGML_UNUSED(bs);
|
||||
int i = 0;
|
||||
ggml_float sumf = 0;
|
||||
|
||||
#if defined(__AVX512BF16__)
|
||||
__m512 c1 = _mm512_setzero_ps();
|
||||
__m512 c2 = _mm512_setzero_ps();
|
||||
for (; i + 64 <= n; i += 64) {
|
||||
c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
|
||||
m512bh(_mm512_loadu_si512((y + i))));
|
||||
c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
|
||||
m512bh(_mm512_loadu_si512((y + i + 32))));
|
||||
}
|
||||
sumf += (ggml_float)_mm512_reduce_add_ps(c1);
|
||||
sumf += (ggml_float)_mm512_reduce_add_ps(c2);
|
||||
|
||||
#elif defined(__AVX512F__)
|
||||
#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
|
||||
__m512 c1 = _mm512_setzero_ps();
|
||||
__m512 c2 = _mm512_setzero_ps();
|
||||
for (; i + 32 <= n; i += 32) {
|
||||
c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
|
||||
c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
|
||||
}
|
||||
sumf += (ggml_float)_mm512_reduce_add_ps(c1);
|
||||
sumf += (ggml_float)_mm512_reduce_add_ps(c2);
|
||||
|
||||
#undef LOAD
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
#if defined(__AVX2__)
|
||||
#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
|
||||
#else
|
||||
#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
|
||||
#endif
|
||||
__m256 c1 = _mm256_setzero_ps();
|
||||
__m256 c2 = _mm256_setzero_ps();
|
||||
__m256 c3 = _mm256_setzero_ps();
|
||||
__m256 c4 = _mm256_setzero_ps();
|
||||
for (; i + 32 <= n; i += 32) {
|
||||
c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
|
||||
c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
|
||||
c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
|
||||
c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
|
||||
}
|
||||
__m128 g;
|
||||
c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
|
||||
_mm256_add_ps(c2, c4));
|
||||
g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
|
||||
_mm256_castps256_ps128(c1));
|
||||
g = _mm_add_ps(g, _mm_movehl_ps(g, g));
|
||||
g = _mm_add_ss(g, _mm_movehdup_ps(g));
|
||||
sumf += (ggml_float)_mm_cvtss_f32(g);
|
||||
|
||||
#undef LOAD
|
||||
#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
|
||||
size_t vl = __riscv_vsetvlmax_e32m4();
|
||||
|
||||
// initialize accumulators to all zeroes
|
||||
vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
|
||||
vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
|
||||
|
||||
// calculate step size
|
||||
const size_t epr = __riscv_vsetvlmax_e16m2();
|
||||
const size_t step = epr * 2;
|
||||
const int np = (n & ~(step - 1));
|
||||
|
||||
// unroll by 2
|
||||
for (; i < np; i += step) {
|
||||
vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr);
|
||||
vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr);
|
||||
vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr);
|
||||
__asm__ __volatile__ ("" ::: "memory");
|
||||
|
||||
vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr);
|
||||
vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr);
|
||||
vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr);
|
||||
__asm__ __volatile__ ("" ::: "memory");
|
||||
}
|
||||
|
||||
// accumulate in 1 register
|
||||
vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl);
|
||||
|
||||
// leftovers
|
||||
for (i = np; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e16m2(n - i);
|
||||
vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl);
|
||||
vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl);
|
||||
vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl);
|
||||
}
|
||||
|
||||
// reduce
|
||||
vl = __riscv_vsetvlmax_e32m4();
|
||||
vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
|
||||
sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
|
||||
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
|
||||
GGML_BF16_TO_FP32(y[i]));
|
||||
}
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
GGML_UNUSED(nrc);
|
||||
GGML_UNUSED(bx);
|
||||
GGML_UNUSED(by);
|
||||
GGML_UNUSED(bs);
|
||||
|
||||
ggml_float sumf = 0.0;
|
||||
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8; //get vector length
|
||||
const int ggml_f16_epr = sve_register_length / 16; // running when 16
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
|
||||
|
||||
const int np= (n & ~(ggml_f16_step - 1));
|
||||
svfloat16_t sum1 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum2 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum3 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum4 = svdup_n_f16(0.0f);
|
||||
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
|
||||
sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
|
||||
}
|
||||
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
|
||||
sum1 = svmad_f16_x(pg, hx, hy, sum1);
|
||||
}
|
||||
GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_zvfh)
|
||||
int vl = __riscv_vsetvlmax_e32m2();
|
||||
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
|
||||
vfloat32m2_t vsum;
|
||||
vfloat16m1_t ax;
|
||||
vfloat16m1_t ay;
|
||||
vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
|
||||
for (int i = 0; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e16m1(n - i);
|
||||
ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
|
||||
ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
|
||||
vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
|
||||
}
|
||||
vl = __riscv_vsetvlmax_e32m1();
|
||||
vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
|
||||
vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
|
||||
sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
|
||||
#else
|
||||
for (int i = 0; i < n; ++i) {
|
||||
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
#endif // __riscv_zvfh
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
|
||||
sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// reduce sum0..sum3 to sum0
|
||||
GGML_F16_VEC_REDUCE(sumf, sum);
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
// if you hit this, you are likely running outside the FP range
|
||||
assert(!isnan(sumf) && !isinf(sumf));
|
||||
#endif
|
||||
#else
|
||||
for (int i = 0; i < n; ++i) {
|
||||
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
#endif // GGML_SIMD
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_silu_f32(const int n, float * y, const float * x) {
|
||||
int i = 0;
|
||||
#if defined(__AVX512F__) && defined(__AVX512DQ__)
|
||||
for (; i + 15 < n; i += 16) {
|
||||
_mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
|
||||
}
|
||||
#elif defined(__AVX2__) && defined(__FMA__)
|
||||
for (; i + 7 < n; i += 8) {
|
||||
_mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
|
||||
}
|
||||
#elif defined(__SSE2__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
_mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
|
||||
}
|
||||
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
||||
const int vlen = svcntw();
|
||||
for (; i < n; i += vlen) {
|
||||
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
||||
svst1_f32(pg, y + i, ggml_v_silu(pg, svld1_f32(pg, x + i)));
|
||||
}
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
for (int vl; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e32m2(n - i);
|
||||
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
|
||||
vfloat32m2_t vy = ggml_v_silu_m2(vx, vl);
|
||||
__riscv_vse32_v_f32m2(&y[i], vy, vl);
|
||||
}
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
y[i] = ggml_silu_f32(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
|
||||
int i = 0;
|
||||
#if defined(__AVX512F__) && defined(__AVX512DQ__)
|
||||
for (; i + 15 < n; i += 16) {
|
||||
_mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
|
||||
}
|
||||
#elif defined(__AVX2__) && defined(__FMA__)
|
||||
for (; i + 7 < n; i += 8) {
|
||||
_mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
|
||||
}
|
||||
#elif defined(__SSE2__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
_mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
|
||||
}
|
||||
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
||||
const int vlen = svcntw();
|
||||
for (; i < n; i += vlen) {
|
||||
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
||||
svst1_f32(pg, y + i, svmul_f32_x(pg, ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
|
||||
}
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
for (int vl; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e32m2(n - i);
|
||||
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
|
||||
vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
|
||||
vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
|
||||
__riscv_vse32_v_f32m2(&y[i], vy, vl);
|
||||
}
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
y[i] = ggml_silu_f32(x[i]) * g[i];
|
||||
}
|
||||
}
|
||||
|
||||
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean) {
|
||||
int i = 0;
|
||||
ggml_float sum = 0;
|
||||
// TODO: optimize to process the remaining elements in groups using the smaller vector sizes from AVX2 and SSE
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/15953#pullrequestreview-3310928344
|
||||
#if defined(__AVX512F__) && defined(__AVX512DQ__)
|
||||
for (; i + 15 < n; i += 16) {
|
||||
__m512 val = _mm512_sub_ps(_mm512_loadu_ps(x + i),
|
||||
_mm512_set1_ps(mean));
|
||||
_mm512_storeu_ps(y + i, val);
|
||||
sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(val, val));
|
||||
}
|
||||
#elif defined(__AVX2__) && defined(__FMA__)
|
||||
for (; i + 7 < n; i += 8) {
|
||||
__m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i),
|
||||
_mm256_set1_ps(mean));
|
||||
_mm256_storeu_ps(y + i, val);
|
||||
val = _mm256_mul_ps(val,val);
|
||||
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
|
||||
_mm256_castps256_ps128(val));
|
||||
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
|
||||
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
|
||||
sum += (ggml_float)_mm_cvtss_f32(val2);
|
||||
}
|
||||
#elif defined(__SSE2__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
__m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
|
||||
_mm_set1_ps(mean));
|
||||
_mm_storeu_ps(y + i, val);
|
||||
val = _mm_mul_ps(val, val);
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
||||
val = _mm_add_ps(val, _mm_movehl_ps(val, val));
|
||||
val = _mm_add_ss(val, _mm_movehdup_ps(val));
|
||||
#else
|
||||
__m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
val = _mm_add_ps(val, tmp);
|
||||
tmp = _mm_movehl_ps(tmp, val);
|
||||
val = _mm_add_ss(val, tmp);
|
||||
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
||||
sum += (ggml_float)_mm_cvtss_f32(val);
|
||||
}
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
float32x4_t val = vsubq_f32(vld1q_f32(x + i),
|
||||
vdupq_n_f32(mean));
|
||||
vst1q_f32(y + i, val);
|
||||
val = vmulq_f32(val, val);
|
||||
sum += (ggml_float)vaddvq_f32(val);
|
||||
}
|
||||
#elif defined(__VXE__) || defined(__VXE2__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
float32x4_t val = vec_sub(vec_xl(0, x + i), vec_splats(mean));
|
||||
vec_xst(val, 0, y + i);
|
||||
val = vec_mul(val, val);
|
||||
sum += (ggml_float)vec_hsum_f32x4(val);
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
|
||||
for (int vl; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e32m2(n - i);
|
||||
vfloat32m2_t val = __riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], vl), mean, vl);
|
||||
__riscv_vse32_v_f32m2(&y[i], val, vl);
|
||||
val = __riscv_vfmul_vv_f32m2(val, val, vl);
|
||||
vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, vl);
|
||||
}
|
||||
sum = (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
float val = x[i] - mean;
|
||||
y[i] = val;
|
||||
val *= val;
|
||||
sum += (ggml_float)val;
|
||||
}
|
||||
return sum/n;
|
||||
}
|
||||
|
||||
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
|
||||
int i = 0;
|
||||
ggml_float sum = 0;
|
||||
#if defined(__AVX512F__) && defined(__AVX512DQ__)
|
||||
for (; i + 15 < n; i += 16) {
|
||||
__m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
|
||||
_mm512_set1_ps(max)));
|
||||
_mm512_storeu_ps(y + i, val);
|
||||
sum += (ggml_float)_mm512_reduce_add_ps(val);
|
||||
}
|
||||
#elif defined(__AVX2__) && defined(__FMA__)
|
||||
for (; i + 7 < n; i += 8) {
|
||||
__m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
|
||||
_mm256_set1_ps(max)));
|
||||
_mm256_storeu_ps(y + i, val);
|
||||
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
|
||||
_mm256_castps256_ps128(val));
|
||||
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
|
||||
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
|
||||
sum += (ggml_float)_mm_cvtss_f32(val2);
|
||||
}
|
||||
#elif defined(__SSE2__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
__m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
|
||||
_mm_set1_ps(max)));
|
||||
_mm_storeu_ps(y + i, val);
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
||||
val = _mm_add_ps(val, _mm_movehl_ps(val, val));
|
||||
val = _mm_add_ss(val, _mm_movehdup_ps(val));
|
||||
#else
|
||||
__m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
val = _mm_add_ps(val, tmp);
|
||||
tmp = _mm_movehl_ps(tmp, val);
|
||||
val = _mm_add_ss(val, tmp);
|
||||
#endif
|
||||
sum += (ggml_float)_mm_cvtss_f32(val);
|
||||
}
|
||||
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
||||
const int vlen = svcntw();
|
||||
for (; i < n; i += vlen) {
|
||||
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
||||
svfloat32_t val = ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
|
||||
svdup_n_f32_x(pg, max)));
|
||||
svst1_f32(pg, y + i, val);
|
||||
sum += (ggml_float)svaddv_f32(pg, val);
|
||||
}
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
|
||||
vdupq_n_f32(max)));
|
||||
vst1q_f32(y + i, val);
|
||||
sum += (ggml_float)vaddvq_f32(val);
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
|
||||
for (int avl; i < n; i += avl) {
|
||||
avl = __riscv_vsetvl_e32m2(n - i);
|
||||
vfloat32m2_t val = ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl);
|
||||
__riscv_vse32_v_f32m2(&y[i], val, avl);
|
||||
vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl);
|
||||
}
|
||||
return (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
float val = expf(x[i] - max);
|
||||
sum += (ggml_float)val;
|
||||
y[i] = val;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
|
||||
// log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_i)
|
||||
|
||||
int i = 0;
|
||||
ggml_float sum = 0;
|
||||
for (; i < n; ++i) {
|
||||
float val = x[i] - max;
|
||||
y[i] = val;
|
||||
sum += (ggml_float)expf(val);
|
||||
}
|
||||
return sum = (ggml_float)logf(sum);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,259 @@
|
||||
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
|
||||
|
||||
find_package(CUDAToolkit)
|
||||
|
||||
if (CUDAToolkit_FOUND)
|
||||
message(STATUS "CUDA Toolkit found")
|
||||
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# native == GPUs available at build time
|
||||
# 50 == Maxwell, lowest CUDA 12 standard
|
||||
# 60 == P100, FP16 CUDA intrinsics
|
||||
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
|
||||
# 70 == V100, FP16 tensor cores
|
||||
# 75 == Turing, int8 tensor cores
|
||||
# 80 == Ampere, asynchronous data loading, faster tensor core instructions
|
||||
# 86 == RTX 3000, needs CUDA v11.1
|
||||
# 89 == RTX 4000, needs CUDA v11.8
|
||||
# 120 == Blackwell, needs CUDA v12.8, FP4 tensor cores
|
||||
#
|
||||
# XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
|
||||
# XX-real == compile CUDA code as device code for this specific architecture
|
||||
# no suffix == compile as both PTX and device code
|
||||
#
|
||||
# The default behavior for a non-native is to build virtual architectures as needed to cover all features needed
|
||||
# for best performance and to also build real architectures for the most commonly used GPUs.
|
||||
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "native")
|
||||
else()
|
||||
if (CUDAToolkit_VERSION VERSION_LESS "13")
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 50-virtual 61-virtual 70-virtual)
|
||||
endif ()
|
||||
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 75-virtual 80-virtual 86-real)
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
|
||||
endif()
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
# The CUDA architecture 120f-virtual would in principle work for Blackwell support
|
||||
# but the newly added "f" suffix conflicted with a preexising regex for validating CUDA architectures in CMake.
|
||||
# So either a recent CMake version or one with the backported fix is needed.
|
||||
# The following versions should work:
|
||||
# - CMake >= v3.31.8 && CMake < v4.0.0
|
||||
# - CMake >= v4.0.2
|
||||
# This is NOT documented in the CMake release notes,
|
||||
# check Modules/Internal/CMakeCUDAArchitecturesValidate.cmake in the CMake git repository instead.
|
||||
# However, the architectures 120a-real and 121a-real should work with basically any CMake version and
|
||||
# until the release of e.g. Rubin there is no benefit to shipping virtual architectures for Blackwell.
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real)
|
||||
endif()
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.9")
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 121a-real)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
enable_language(CUDA)
|
||||
|
||||
# TODO: Remove once CCCL 3.2 has been released and bundled with CUDA Toolkit
|
||||
if (GGML_CUDA_CUB_3DOT2)
|
||||
include(FetchContent)
|
||||
|
||||
FetchContent_Declare(
|
||||
CCCL
|
||||
GIT_REPOSITORY https://github.com/nvidia/cccl.git
|
||||
GIT_TAG v3.2.0-rc2
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
|
||||
FetchContent_MakeAvailable(CCCL)
|
||||
endif()
|
||||
|
||||
# Replace any plain 12X CUDA architectures with their "architecture-specific" equivalents 12Xa.
|
||||
# 12X is forwards-compatible, 12Xa is not.
|
||||
# Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
|
||||
# But while 12X vs. 12Xa can be checked in device code there is (to my knowledge) no easy way to do the same check in host code.
|
||||
# So for now just replace all instances of 12X with 12Xa, this should be fine until Rubin is released.
|
||||
foreach(ARCHS IN ITEMS CMAKE_CUDA_ARCHITECTURES CMAKE_CUDA_ARCHITECTURES_NATIVE)
|
||||
set(FIXED_ARCHS "")
|
||||
foreach(ARCH IN LISTS ${ARCHS})
|
||||
if (ARCH MATCHES "^12[0-9](-real|-virtual)?$")
|
||||
string(REGEX REPLACE "^(12[0-9])((-real|-virtual)?)$" "\\1a\\2" FIXED_ARCH ${ARCH})
|
||||
message(STATUS "Replacing ${ARCH} in ${ARCHS} with ${FIXED_ARCH}")
|
||||
list(APPEND FIXED_ARCHS "${FIXED_ARCH}")
|
||||
else()
|
||||
list(APPEND FIXED_ARCHS "${ARCH}")
|
||||
endif()
|
||||
endforeach()
|
||||
set(${ARCHS} ${FIXED_ARCHS})
|
||||
endforeach()
|
||||
|
||||
# If we try to compile a "native" build it will use the 12X architectures and fail.
|
||||
# So we should instead use the native architectures as determined by CMake after replacing 12X with 12Xa.
|
||||
# But if at the time of the build no GPUs are connected at all CMAKE_CUDA_ARCHITECTURES will contain garbage that we should not use.
|
||||
if (CMAKE_CUDA_ARCHITECTURES STREQUAL "native" AND CMAKE_CUDA_ARCHITECTURES_NATIVE MATCHES "^[0-9]+(a|f)?(-real|-virtual)?(;[0-9]+(a|f)?(-real|-virtual)?|;)*$")
|
||||
set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
|
||||
endif()
|
||||
message(STATUS "Using CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} CMAKE_CUDA_ARCHITECTURES_NATIVE=${CMAKE_CUDA_ARCHITECTURES_NATIVE}")
|
||||
|
||||
file(GLOB GGML_HEADERS_CUDA "*.cuh")
|
||||
list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
|
||||
|
||||
file(GLOB GGML_SOURCES_CUDA "*.cu")
|
||||
file(GLOB SRCS "template-instances/fattn-tile*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/fattn-mma*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/mmq*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/mmf*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
|
||||
if (GGML_CUDA_FA_ALL_QUANTS)
|
||||
file(GLOB SRCS "template-instances/fattn-vec*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
|
||||
else()
|
||||
file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
endif()
|
||||
|
||||
ggml_add_backend_library(ggml-cuda
|
||||
${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_CUDA}
|
||||
)
|
||||
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (GGML_CUDA_GRAPHS)
|
||||
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_VMM)
|
||||
add_compile_definitions(GGML_CUDA_NO_VMM)
|
||||
endif()
|
||||
|
||||
if (NOT GGML_CUDA_FA)
|
||||
add_compile_definitions(GGML_CUDA_NO_FA)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_PEER_COPY)
|
||||
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
|
||||
endif()
|
||||
|
||||
if (GGML_STATIC)
|
||||
if (WIN32)
|
||||
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas)
|
||||
else ()
|
||||
if (GGML_CUDA_CUB_3DOT2)
|
||||
target_link_libraries(ggml-cuda PRIVATE CCCL::CCCL)
|
||||
endif()
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.1")
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
else()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
if (GGML_CUDA_CUB_3DOT2)
|
||||
target_link_libraries(ggml-cuda PRIVATE CCCL::CCCL)
|
||||
endif()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_VMM)
|
||||
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
|
||||
else()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
|
||||
endif()
|
||||
|
||||
set(CUDA_CXX_FLAGS "")
|
||||
|
||||
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
||||
|
||||
if (GGML_CUDA_DEBUG)
|
||||
list(APPEND CUDA_FLAGS -lineinfo)
|
||||
add_compile_definitions(GGML_CUDA_DEBUG)
|
||||
endif()
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
# Options are:
|
||||
# - none (not recommended)
|
||||
# - speed (nvcc's default)
|
||||
# - balance
|
||||
# - size
|
||||
list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
|
||||
endif()
|
||||
|
||||
if (GGML_FATAL_WARNINGS)
|
||||
list(APPEND CUDA_FLAGS -Werror all-warnings)
|
||||
endif()
|
||||
|
||||
if (GGML_ALL_WARNINGS AND NOT MSVC)
|
||||
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
|
||||
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
|
||||
list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
COMMAND ${NVCC_CMD} -Xcompiler --version
|
||||
OUTPUT_VARIABLE CUDA_CCFULLVER
|
||||
ERROR_QUIET
|
||||
)
|
||||
|
||||
if (NOT CUDA_CCFULLVER MATCHES clang)
|
||||
set(CUDA_CCID "GNU")
|
||||
execute_process(
|
||||
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
|
||||
OUTPUT_VARIABLE CUDA_CCVER
|
||||
ERROR_QUIET
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
else()
|
||||
if (CUDA_CCFULLVER MATCHES Apple)
|
||||
set(CUDA_CCID "AppleClang")
|
||||
else()
|
||||
set(CUDA_CCID "Clang")
|
||||
endif()
|
||||
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
|
||||
endif()
|
||||
|
||||
message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
||||
|
||||
ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
|
||||
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
|
||||
else()
|
||||
# CCCL 3.2 onwards will require a cpp-standard-compliant preprocessor for MSVC
|
||||
# https://github.com/NVIDIA/cccl/pull/6827
|
||||
list(APPEND CUDA_CXX_FLAGS /Zc:preprocessor)
|
||||
endif()
|
||||
|
||||
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
|
||||
|
||||
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
|
||||
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
|
||||
endif()
|
||||
|
||||
target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
|
||||
else()
|
||||
message(FATAL_ERROR "CUDA Toolkit not found")
|
||||
endif()
|
||||
@@ -0,0 +1,61 @@
|
||||
#include "acc.cuh"
|
||||
|
||||
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
|
||||
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
||||
const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
|
||||
const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t src1_idx = i - offset;
|
||||
|
||||
int64_t tmp = src1_idx;
|
||||
const int64_t i13 = tmp / s13;
|
||||
tmp -= i13 * s13;
|
||||
const int64_t i12 = tmp / s12;
|
||||
tmp -= i12 * s12;
|
||||
const int64_t i11 = tmp / s11;
|
||||
tmp -= i11 * s11;
|
||||
const int64_t i10 = tmp;
|
||||
|
||||
float val = x[i];
|
||||
if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
|
||||
val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
|
||||
}
|
||||
dst[i] = val;
|
||||
}
|
||||
|
||||
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
|
||||
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
||||
const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
|
||||
const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
|
||||
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(dst));
|
||||
|
||||
const int64_t s1 = dst->op_params[0] / sizeof(float);
|
||||
const int64_t s2 = dst->op_params[1] / sizeof(float);
|
||||
const int64_t s3 = dst->op_params[2] / sizeof(float);
|
||||
const int64_t offset = dst->op_params[3] / sizeof(float);
|
||||
|
||||
acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream);
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_ACC_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,58 @@
|
||||
#include "add-id.cuh"
|
||||
|
||||
static __global__ void add_id_kernel(
|
||||
const float * src0, const float * src1, const int32_t * src2, float * dst,
|
||||
int64_t ne0, int64_t ne1,
|
||||
size_t nb01, size_t nb02,
|
||||
size_t nb11,
|
||||
size_t nb21
|
||||
) {
|
||||
|
||||
const int64_t i1 = blockIdx.x;
|
||||
const int64_t i2 = blockIdx.y;
|
||||
|
||||
const int i11 = *(const int32_t *) ((const char *) src2 + i1*sizeof(int32_t) + i2*nb21);
|
||||
|
||||
const size_t nb1 = ne0 * sizeof(float);
|
||||
const size_t nb2 = ne1 * nb1;
|
||||
|
||||
float * dst_row = (float *)((char *)dst + i1*nb1 + i2*nb2);
|
||||
const float * src0_row = (const float *)((const char *)src0 + i1*nb01 + i2*nb02);
|
||||
const float * src1_row = (const float *)((const char *)src1 + i11*nb11);
|
||||
|
||||
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
|
||||
dst_row[i0] = src0_row[i0] + src1_row[i0];
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_add_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
const ggml_tensor * src2 = dst->src[2];
|
||||
|
||||
GGML_TENSOR_TERNARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src2->type == GGML_TYPE_I32);
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
GGML_ASSERT(nb20 == sizeof(int32_t));
|
||||
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
const int32_t * src2_d = (const int32_t *)src2->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
|
||||
int threads = std::min((int)ne00, 768); // cols
|
||||
dim3 blocks(ne01, ne02); // n_experts_used, n_tokens
|
||||
add_id_kernel<<<blocks, threads, 0, ctx.stream()>>>(
|
||||
src0_d, src1_d, src2_d, dst_d,
|
||||
ne0, ne1,
|
||||
nb01, nb02,
|
||||
nb11,
|
||||
nb21
|
||||
);
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_op_add_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,34 @@
|
||||
#include "arange.cuh"
|
||||
|
||||
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
|
||||
// blockIDx.x: idx of ne0 / BLOCK_SIZE
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
dst[nidx] = start + step * nidx;
|
||||
}
|
||||
|
||||
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
|
||||
int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
|
||||
arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
float start;
|
||||
float stop;
|
||||
float step;
|
||||
memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
|
||||
memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
|
||||
memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
|
||||
|
||||
int64_t steps = (int64_t)ceil((stop - start) / step);
|
||||
GGML_ASSERT(ggml_nelements(dst) == steps);
|
||||
|
||||
arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_ARANGE_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,91 @@
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
|
||||
#include "argmax.cuh"
|
||||
#include "common.cuh"
|
||||
#include "sum.cuh"
|
||||
|
||||
static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) {
|
||||
const int64_t row = blockIdx.x;
|
||||
|
||||
float maxval = -FLT_MAX;
|
||||
int argmax = -1;
|
||||
const float * rowx = x + row * ncols;
|
||||
|
||||
for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) {
|
||||
const float val = rowx[col];
|
||||
if (val > maxval) {
|
||||
maxval = val;
|
||||
argmax = col;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
|
||||
const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
|
||||
const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
|
||||
if (val > maxval) {
|
||||
maxval = val;
|
||||
argmax = col;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_warps = blockDim.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
if (n_warps > 1) {
|
||||
constexpr int max_warps = 1024 / WARP_SIZE;
|
||||
__shared__ float shared_maxval[max_warps];
|
||||
__shared__ int shared_argmax[max_warps];
|
||||
if (lane_id == 0) {
|
||||
shared_maxval[warp_id] = maxval;
|
||||
shared_argmax[warp_id] = argmax;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (warp_id == 0) {
|
||||
if (lane_id < n_warps) {
|
||||
maxval = shared_maxval[lane_id];
|
||||
argmax = shared_argmax[lane_id];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
|
||||
const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
|
||||
const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
|
||||
if (val > maxval) {
|
||||
maxval = val;
|
||||
argmax = col;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (warp_id == 0 && lane_id == 0) {
|
||||
dst[row] = argmax;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
int32_t * dst_d = (int32_t *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int64_t num_blocks = nrows;
|
||||
const int64_t num_threads = std::min<int64_t>(1024, (ne00 + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE);
|
||||
const dim3 blocks_dim(num_threads, 1, 1);
|
||||
const dim3 blocks_num(num_blocks, 1, 1);
|
||||
|
||||
argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00);
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,221 @@
|
||||
#include "argsort.cuh"
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
# include <cub/cub.cuh>
|
||||
using namespace cub;
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
static __global__ void init_indices(int * indices, const int ncols, const int nrows) {
|
||||
const int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int row = blockIdx.y;
|
||||
|
||||
if (col < ncols && row < nrows) {
|
||||
indices[row * ncols + col] = col;
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx <= nrows) {
|
||||
offsets[idx] = idx * ncols;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
||||
const float * x,
|
||||
int * dst,
|
||||
const int ncols,
|
||||
const int nrows,
|
||||
ggml_sort_order order,
|
||||
cudaStream_t stream) {
|
||||
ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
|
||||
ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
|
||||
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
|
||||
|
||||
int * temp_indices = temp_indices_alloc.get();
|
||||
float * temp_keys = temp_keys_alloc.get();
|
||||
int * d_offsets = offsets_alloc.get();
|
||||
|
||||
static const int block_size = 256;
|
||||
const dim3 grid_size((ncols + block_size - 1) / block_size, nrows);
|
||||
init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols, nrows);
|
||||
|
||||
const dim3 offset_grid((nrows + block_size - 1) / block_size);
|
||||
init_offsets<<<offset_grid, block_size, 0, stream>>>(d_offsets, ncols, nrows);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
|
||||
|
||||
size_t temp_storage_bytes = 0;
|
||||
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
if (nrows == 1) {
|
||||
DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream);
|
||||
} else {
|
||||
DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols * nrows, nrows, // num items, num segments
|
||||
d_offsets, d_offsets + 1, stream);
|
||||
}
|
||||
} else {
|
||||
if (nrows == 1) {
|
||||
DeviceRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream);
|
||||
} else {
|
||||
DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
|
||||
dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
|
||||
void * d_temp_storage = temp_storage_alloc.get();
|
||||
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
if (nrows == 1) {
|
||||
DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream);
|
||||
} else {
|
||||
DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
|
||||
ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
|
||||
}
|
||||
} else {
|
||||
if (nrows == 1) {
|
||||
DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream);
|
||||
} else {
|
||||
DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
|
||||
temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
|
||||
stream);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
// Bitonic sort implementation
|
||||
template<typename T>
|
||||
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
|
||||
T tmp = a;
|
||||
a = b;
|
||||
b = tmp;
|
||||
}
|
||||
|
||||
template<ggml_sort_order order>
|
||||
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
|
||||
// bitonic sort
|
||||
int col = threadIdx.x;
|
||||
int row = blockIdx.x;
|
||||
|
||||
if (col >= ncols_pad) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float * x_row = x + row * ncols;
|
||||
extern __shared__ int dst_row[];
|
||||
|
||||
// initialize indices
|
||||
dst_row[col] = col;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int k = 2; k <= ncols_pad; k *= 2) {
|
||||
for (int j = k / 2; j > 0; j /= 2) {
|
||||
int ixj = col ^ j;
|
||||
if (ixj > col) {
|
||||
if ((col & k) == 0) {
|
||||
if (dst_row[col] >= ncols ||
|
||||
(dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
|
||||
x_row[dst_row[col]] > x_row[dst_row[ixj]] :
|
||||
x_row[dst_row[col]] < x_row[dst_row[ixj]]))
|
||||
) {
|
||||
ggml_cuda_swap(dst_row[col], dst_row[ixj]);
|
||||
}
|
||||
} else {
|
||||
if (dst_row[ixj] >= ncols ||
|
||||
(dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
|
||||
x_row[dst_row[col]] < x_row[dst_row[ixj]] :
|
||||
x_row[dst_row[col]] > x_row[dst_row[ixj]]))
|
||||
) {
|
||||
ggml_cuda_swap(dst_row[col], dst_row[ixj]);
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
// copy the result to dst without the padding
|
||||
if (col < ncols) {
|
||||
dst[row * ncols + col] = dst_row[col];
|
||||
}
|
||||
}
|
||||
|
||||
static int next_power_of_2(int x) {
|
||||
int n = 1;
|
||||
while (n < x) {
|
||||
n *= 2;
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
void argsort_f32_i32_cuda_bitonic(const float * x,
|
||||
int * dst,
|
||||
const int ncols,
|
||||
const int nrows,
|
||||
ggml_sort_order order,
|
||||
cudaStream_t stream) {
|
||||
// bitonic sort requires ncols to be power of 2
|
||||
const int ncols_pad = next_power_of_2(ncols);
|
||||
|
||||
const dim3 block_dims(ncols_pad, 1, 1);
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
const size_t shared_mem = ncols_pad * sizeof(int);
|
||||
|
||||
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
|
||||
GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
|
||||
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>
|
||||
<<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
|
||||
} else if (order == GGML_SORT_ORDER_DESC) {
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>
|
||||
<<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
const int64_t ncols = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
const int ncols_pad = next_power_of_2(ncols);
|
||||
const size_t shared_mem = ncols_pad * sizeof(int);
|
||||
const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
|
||||
|
||||
if (shared_mem > max_shared_mem || ncols > 1024) {
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
|
||||
} else {
|
||||
argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
|
||||
}
|
||||
#else
|
||||
argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
|
||||
#endif
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
||||
const float * x,
|
||||
int * dst,
|
||||
const int ncols,
|
||||
const int nrows,
|
||||
ggml_sort_order order,
|
||||
cudaStream_t stream);
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
void argsort_f32_i32_cuda_bitonic(const float * x,
|
||||
int * dst,
|
||||
const int ncols,
|
||||
const int nrows,
|
||||
ggml_sort_order order,
|
||||
cudaStream_t stream);
|
||||
@@ -0,0 +1,502 @@
|
||||
#include "binbcast.cuh"
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
|
||||
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
||||
return b;
|
||||
GGML_UNUSED(a);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_add(const float a, const float b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_sub(const float a, const float b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_mul(const float a, const float b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_div(const float a, const float b) {
|
||||
return a / b;
|
||||
}
|
||||
|
||||
template <float (*bin_op)(const float, const float),
|
||||
typename src0_t,
|
||||
typename src1_t,
|
||||
typename dst_t,
|
||||
typename... src1_ptrs>
|
||||
static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const src1_t * src1,
|
||||
dst_t * dst,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const uint3 ne3,
|
||||
const uint3 ne10,
|
||||
const uint3 ne11,
|
||||
const uint3 ne12,
|
||||
const uint3 ne13,
|
||||
/*int s0, */ const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
/*int s00,*/ const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
/*int s10,*/ const int s11,
|
||||
const int s12,
|
||||
const int s13,
|
||||
src1_ptrs... src1s) {
|
||||
const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const uint32_t i1 = (blockDim.y * blockIdx.y + threadIdx.y);
|
||||
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
|
||||
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
|
||||
|
||||
if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t i11 = fastmodulo(i1, ne11);
|
||||
const uint32_t i12 = fastmodulo(i2, ne12);
|
||||
const uint32_t i13 = fastmodulo(i3, ne13);
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
|
||||
const uint32_t i10 = fastmodulo(i0, ne10);
|
||||
|
||||
float result = src0_row ? (float) src0_row[i0] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10])));
|
||||
} else {
|
||||
result = bin_op(result, (float)src1[i_src1 + i10]);
|
||||
}
|
||||
|
||||
dst_row[i0] = (dst_t) result;
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*bin_op)(const float, const float),
|
||||
typename src0_t,
|
||||
typename src1_t,
|
||||
typename dst_t,
|
||||
typename... src1_ptrs>
|
||||
static __global__ void k_bin_bcast_unravel(const src0_t * src0,
|
||||
const src1_t * src1,
|
||||
dst_t * dst,
|
||||
const uint3 ne0,
|
||||
const uint3 ne1,
|
||||
const uint3 ne2,
|
||||
const uint32_t ne3,
|
||||
const uint3 prod_012,
|
||||
const uint3 prod_01,
|
||||
const uint3 ne10,
|
||||
const uint3 ne11,
|
||||
const uint3 ne12,
|
||||
const uint3 ne13,
|
||||
/*int s0, */ const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
/*int s00,*/ const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
/*int s10,*/ const int s11,
|
||||
const int s12,
|
||||
const int s13,
|
||||
src1_ptrs... src1s) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
const uint32_t i3 = fastdiv(i, prod_012);
|
||||
const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
|
||||
const uint32_t i1 = fastdiv(i - i3 * prod_012.z - i2 * prod_01.z, ne0);
|
||||
const uint32_t i0 = i - i3 * prod_012.z - i2 * prod_01.z - i1 * ne0.z;
|
||||
|
||||
if (i0 >= ne0.z || i1 >= ne1.z || i2 >= ne2.z || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = fastmodulo(i1, ne11);
|
||||
const int i12 = fastmodulo(i2, ne12);
|
||||
const int i13 = fastmodulo(i3, ne13);
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = fastmodulo(i0, ne10);
|
||||
|
||||
float result = src0_row ? (float) src0_row[i0] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10])));
|
||||
} else {
|
||||
result = bin_op(result, (float)src1[i_src1 + i10]);
|
||||
}
|
||||
|
||||
dst_row[i0] = (dst_t) result;
|
||||
}
|
||||
|
||||
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t, size_t... I>
|
||||
static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
|
||||
cudaStream_t stream, std::index_sequence<I...>) {
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
int nr0 = ne10 / ne0;
|
||||
int nr1 = ne11 / ne1;
|
||||
int nr2 = ne12 / ne2;
|
||||
int nr3 = ne13 / ne3;
|
||||
|
||||
int nr[4] = { nr0, nr1, nr2, nr3 };
|
||||
|
||||
int64_t cne[] = { ne0, ne1, ne2, ne3 };
|
||||
int64_t cne0[] = { ne00, ne01, ne02, ne03 };
|
||||
int64_t cne1[] = { ne10, ne11, ne12, ne13 };
|
||||
|
||||
size_t cnb[] = { nb0, nb1, nb2, nb3 };
|
||||
size_t cnb0[] = { nb00, nb01, nb02, nb03 };
|
||||
size_t cnb1[] = { nb10, nb11, nb12, nb13 };
|
||||
|
||||
auto collapse = [](int64_t cne[]) {
|
||||
cne[0] *= cne[1];
|
||||
cne[1] = cne[2];
|
||||
cne[2] = cne[3];
|
||||
cne[3] = 1;
|
||||
};
|
||||
|
||||
auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
|
||||
cnb[1] *= cne[1];
|
||||
cnb[2] *= cne[2];
|
||||
cnb[3] *= cne[3];
|
||||
};
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (nr[i] != 1) {
|
||||
break;
|
||||
}
|
||||
if (i > 0) {
|
||||
collapse_nb(cnb, cne);
|
||||
collapse_nb(cnb0, cne0);
|
||||
collapse_nb(cnb1, cne1);
|
||||
collapse(cne);
|
||||
collapse(cne0);
|
||||
collapse(cne1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int64_t ne0 = cne[0];
|
||||
int64_t ne1 = cne[1];
|
||||
int64_t ne2 = cne[2];
|
||||
int64_t ne3 = cne[3];
|
||||
|
||||
//int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
|
||||
//int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
|
||||
//int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
|
||||
//int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
|
||||
|
||||
size_t nb0 = cnb[0];
|
||||
size_t nb1 = cnb[1];
|
||||
size_t nb2 = cnb[2];
|
||||
size_t nb3 = cnb[3];
|
||||
|
||||
size_t nb00 = cnb0[0];
|
||||
size_t nb01 = cnb0[1];
|
||||
size_t nb02 = cnb0[2];
|
||||
size_t nb03 = cnb0[3];
|
||||
|
||||
size_t nb10 = cnb1[0];
|
||||
size_t nb11 = cnb1[1];
|
||||
size_t nb12 = cnb1[2];
|
||||
size_t nb13 = cnb1[3];
|
||||
|
||||
size_t s0 = nb0 / sizeof(dst_t);
|
||||
size_t s1 = nb1 / sizeof(dst_t);
|
||||
size_t s2 = nb2 / sizeof(dst_t);
|
||||
size_t s3 = nb3 / sizeof(dst_t);
|
||||
|
||||
size_t s10 = nb10 / sizeof(src1_t);
|
||||
size_t s11 = nb11 / sizeof(src1_t);
|
||||
size_t s12 = nb12 / sizeof(src1_t);
|
||||
size_t s13 = nb13 / sizeof(src1_t);
|
||||
|
||||
size_t s00 = nb00 / sizeof(src0_t);
|
||||
size_t s01 = nb01 / sizeof(src0_t);
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(s0 == 1);
|
||||
GGML_ASSERT(s00 == 1);
|
||||
GGML_ASSERT(s10 == 1);
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0 / 2LL, 1LL);
|
||||
|
||||
dim3 block_dims;
|
||||
block_dims.x = std::min<unsigned int>(hne0, block_size);
|
||||
block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
|
||||
block_dims.z = std::min(std::min<unsigned int>(ne2 * ne3, block_size / block_dims.x / block_dims.y), 64U);
|
||||
|
||||
dim3 block_nums((hne0 + block_dims.x - 1) / block_dims.x, (ne1 + block_dims.y - 1) / block_dims.y,
|
||||
(ne2 * ne3 + block_dims.z - 1) / block_dims.z);
|
||||
|
||||
const uint3 ne10 = init_fastdiv_values((uint32_t) cne1[0]);
|
||||
const uint3 ne11 = init_fastdiv_values((uint32_t) cne1[1]);
|
||||
const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
|
||||
const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
|
||||
|
||||
if (block_nums.z > 65535 || block_nums.y > 65535) {
|
||||
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
|
||||
const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
|
||||
const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));
|
||||
const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
|
||||
const uint3 ne1_fastdiv = init_fastdiv_values((uint32_t) ne1);
|
||||
const uint3 ne2_fastdiv = init_fastdiv_values((uint32_t) ne2);
|
||||
|
||||
if constexpr (sizeof...(I) > 0) {
|
||||
k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t><<<block_num, block_size, 0, stream>>>(
|
||||
src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11,
|
||||
ne12, ne13,
|
||||
/* s0, */ s1, s2, s3,
|
||||
/* s00,*/ s01, s02, s03,
|
||||
/* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
|
||||
} else {
|
||||
k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t>
|
||||
<<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv,
|
||||
ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13,
|
||||
/* s0, */ s1, s2, s3,
|
||||
/* s00,*/ s01, s02, s03,
|
||||
/* s10,*/ s11, s12, s13);
|
||||
}
|
||||
} else {
|
||||
const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
|
||||
if constexpr (sizeof...(I) > 0) {
|
||||
k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
|
||||
/* s0, */ s1, s2, s3,
|
||||
/* s00,*/ s01, s02, s03,
|
||||
/* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
|
||||
} else {
|
||||
k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
|
||||
/* s0, */ s1, s2, s3,
|
||||
/* s00,*/ s01, s02, s03,
|
||||
/* s10,*/ s11, s12, s13);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __global__ void k_repeat_back(
|
||||
const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const size_t s00, const size_t s01, const size_t s02, const size_t s03,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
|
||||
|
||||
const int64_t tid0 = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
|
||||
const int64_t tid1 = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
|
||||
const int64_t tid23 = int64_t(blockIdx.z)*blockDim.z + threadIdx.z;
|
||||
const int64_t tid2 = tid23 % ne2;
|
||||
const int64_t tid3 = tid23 / ne2;
|
||||
|
||||
if (tid0 >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
T sum = 0;
|
||||
for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) {
|
||||
for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
|
||||
for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
|
||||
for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
|
||||
sum += src[i3*s03 + i2*s02 + i1*s01 + i0*s00];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
|
||||
}
|
||||
|
||||
template <float (*bin_op)(const float, const float), int n_fuse = 1>
|
||||
struct bin_bcast_cuda {
|
||||
template<typename src0_t, typename src1_t, typename dst_t>
|
||||
void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
|
||||
const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
|
||||
cudaStream_t stream) {
|
||||
launch_bin_bcast_pack<bin_op, src0_t, src1_t, dst_t>(
|
||||
src0, src1, dst, src0_dd, src1_dd, dst_dd, stream, std::make_index_sequence<n_fuse>{});
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static void repeat_back_cuda(
|
||||
const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const size_t s00, const size_t s01, const size_t s02, const size_t s03,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
|
||||
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2*ne3);
|
||||
k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>
|
||||
(src, dst, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3);
|
||||
}
|
||||
|
||||
template<class op>
|
||||
static void ggml_cuda_op_bin_bcast(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
op()(src0, src1, dst, (const half *) src0_dd, (const half *)src1_dd, (half *) dst_dd, stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
||||
op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat, 0>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
|
||||
}
|
||||
|
||||
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||
}
|
||||
|
||||
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||
}
|
||||
|
||||
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||
}
|
||||
|
||||
template <float (*op)(const float, const float), int n_fuse>
|
||||
static void ggml_cuda_op_fused_binbcast_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
launch_bin_bcast_pack<op, float, float, float>(src0, src1, dst,
|
||||
(const float *) src0->data, (const float *) src1->data, (float *) dst->data,
|
||||
stream, std::make_index_sequence<n_fuse>{});
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
launch_bin_bcast_pack<op, half, half, half>(src0, src1, dst,
|
||||
(const half *) src0->data, (const half *) src1->data, (half *) dst->data,
|
||||
stream, std::make_index_sequence<n_fuse>{});
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
||||
launch_bin_bcast_pack<op, half, float, half>(src0, src1, dst,
|
||||
(const half *) src0->data, (const float *) src1->data, (half *) dst->data,
|
||||
stream, std::make_index_sequence<n_fuse>{});
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
launch_bin_bcast_pack<op, half, float, float>(src0, src1, dst,
|
||||
(const half *) src0->data, (const float *) src1->data, (float *) dst->data,
|
||||
stream, std::make_index_sequence<n_fuse>{});
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"%s: unsupported types for fusion: dst: %s, src0: %s, src1: %s\n",
|
||||
__func__, ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ggml_cuda_op_fused_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst, int n_fuse) {
|
||||
GGML_ASSERT(2 <= n_fuse && n_fuse <= 8);
|
||||
|
||||
switch (n_fuse) {
|
||||
case 2:
|
||||
ggml_cuda_op_fused_binbcast_impl<op_add, 2>(ctx, dst);
|
||||
break;
|
||||
case 3:
|
||||
ggml_cuda_op_fused_binbcast_impl<op_add, 3>(ctx, dst);
|
||||
break;
|
||||
case 4:
|
||||
ggml_cuda_op_fused_binbcast_impl<op_add, 4>(ctx, dst);
|
||||
break;
|
||||
case 5:
|
||||
ggml_cuda_op_fused_binbcast_impl<op_add, 5>(ctx, dst);
|
||||
break;
|
||||
case 6:
|
||||
ggml_cuda_op_fused_binbcast_impl<op_add, 6>(ctx, dst);
|
||||
break;
|
||||
case 7:
|
||||
ggml_cuda_op_fused_binbcast_impl<op_add, 7>(ctx, dst);
|
||||
break;
|
||||
case 8:
|
||||
ggml_cuda_op_fused_binbcast_impl<op_add, 8>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "Unsupported n_fuse value");
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
GGML_ASSERT(ggml_can_repeat(dst, src0));
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
|
||||
GGML_ASSERT(ne2*ne3 <= (1 << 15));
|
||||
|
||||
const size_t ts = ggml_type_size(src0->type);
|
||||
const size_t s00 = nb00 / ts;
|
||||
const size_t s01 = nb01 / ts;
|
||||
const size_t s02 = nb02 / ts;
|
||||
const size_t s03 = nb03 / ts;
|
||||
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
repeat_back_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3, stream);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_fused_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst, int n_fuse);
|
||||
@@ -0,0 +1,45 @@
|
||||
#include "clamp.cuh"
|
||||
|
||||
static __device__ __forceinline__ float op_clamp(float x, float min, float max) {
|
||||
return fminf(fmaxf(x, min), max);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static __global__ void op_clamp_kernel(const T * x, T * dst, const T min, const T max, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[i] = (T)op_clamp((float)x[i], (float)min, (float)max);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static void clamp_cuda(const T * x, T * dst, const T min, const T max, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
|
||||
op_clamp_kernel<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
|
||||
}
|
||||
|
||||
|
||||
void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const void * src0_d = src0->data;
|
||||
void * dst_d = dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
|
||||
float min;
|
||||
float max;
|
||||
memcpy(&min, dst->op_params, sizeof(float));
|
||||
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
if (src0->type == GGML_TYPE_F16) {
|
||||
clamp_cuda((const half *)src0_d, (half *)dst_d, (half)min, (half)max, ggml_nelements(src0), stream);
|
||||
} else {
|
||||
clamp_cuda((const float *)src0_d, (float *)dst_d, (float)min, (float)max, ggml_nelements(src0), stream);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CLAMP_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,221 @@
|
||||
#include "concat.cuh"
|
||||
|
||||
// contiguous kernels
|
||||
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (nidx < ne00) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne00 +
|
||||
blockIdx.z * ne00 * gridDim.y;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
(nidx - ne00) +
|
||||
blockIdx.y * (ne0 - ne00) +
|
||||
blockIdx.z * (ne0 - ne00) * gridDim.y;
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (blockIdx.y < (unsigned)ne01) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * ne01;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
nidx +
|
||||
(blockIdx.y - ne01) * ne0 +
|
||||
blockIdx.z * ne0 * (gridDim.y - ne01);
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (blockIdx.z < (unsigned)ne02) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
(blockIdx.z - ne02) * ne0 * gridDim.y;
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
|
||||
int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
|
||||
dim3 gridDim(num_blocks, ne1, ne2);
|
||||
if (dim == 0) {
|
||||
concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
|
||||
return;
|
||||
}
|
||||
if (dim == 1) {
|
||||
concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
|
||||
return;
|
||||
}
|
||||
concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
|
||||
}
|
||||
|
||||
// non-contiguous kernel (slow)
|
||||
template <int dim>
|
||||
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
|
||||
concat_f32_non_cont(
|
||||
const char * src0,
|
||||
const char * src1,
|
||||
char * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne03,
|
||||
uint64_t nb00,
|
||||
uint64_t nb01,
|
||||
uint64_t nb02,
|
||||
uint64_t nb03,
|
||||
int64_t /*ne10*/,
|
||||
int64_t /*ne11*/,
|
||||
int64_t /*ne12*/,
|
||||
int64_t /*ne13*/,
|
||||
uint64_t nb10,
|
||||
uint64_t nb11,
|
||||
uint64_t nb12,
|
||||
uint64_t nb13,
|
||||
int64_t ne0,
|
||||
int64_t /*ne1*/,
|
||||
int64_t /*ne2*/,
|
||||
int64_t /*ne3*/,
|
||||
uint64_t nb0,
|
||||
uint64_t nb1,
|
||||
uint64_t nb2,
|
||||
uint64_t nb3){
|
||||
static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");
|
||||
|
||||
const int64_t i3 = blockIdx.z;
|
||||
const int64_t i2 = blockIdx.y;
|
||||
const int64_t i1 = blockIdx.x;
|
||||
|
||||
const float * x;
|
||||
|
||||
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
||||
} else {
|
||||
if constexpr (dim == 0) {
|
||||
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
|
||||
} else if constexpr (dim == 1) {
|
||||
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
|
||||
} else if constexpr (dim == 2) {
|
||||
x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
|
||||
} else if constexpr (dim == 3) {
|
||||
x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
|
||||
}
|
||||
}
|
||||
|
||||
float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
|
||||
float * dst_d = (float *)dst->data;
|
||||
|
||||
if (dim != 3) {
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_cuda(
|
||||
src0_d + i3 * (src0->nb[3] / 4),
|
||||
src1_d + i3 * (src1->nb[3] / 4),
|
||||
dst_d + i3 * ( dst->nb[3] / 4),
|
||||
src0->ne[0], src0->ne[1], src0->ne[2],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
|
||||
}
|
||||
} else {
|
||||
const size_t size0 = ggml_nbytes(src0);
|
||||
const size_t size1 = ggml_nbytes(src1);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
} else {
|
||||
dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
|
||||
auto launch_kernel = [&](auto dim) {
|
||||
concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
|
||||
(const char *) src0->data, (const char *) src1->data, (char *) dst->data,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
|
||||
};
|
||||
switch (dim) {
|
||||
case 0:
|
||||
launch_kernel(std::integral_constant<int, 0>{});
|
||||
break;
|
||||
case 1:
|
||||
launch_kernel(std::integral_constant<int, 1>{});
|
||||
break;
|
||||
case 2:
|
||||
launch_kernel(std::integral_constant<int, 2>{});
|
||||
break;
|
||||
case 3:
|
||||
launch_kernel(std::integral_constant<int, 3>{});
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Invalid dim: %d", dim);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CONCAT_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,86 @@
|
||||
#include "conv-transpose-1d.cuh"
|
||||
|
||||
static __global__ void conv_transpose_1d_kernel(
|
||||
const int s0, const int p0, const int d0, const int output_size,
|
||||
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
|
||||
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
|
||||
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
|
||||
const float * src0, const float * src1, float * dst) {
|
||||
int global_index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (global_index >= output_size) {
|
||||
return;
|
||||
}
|
||||
|
||||
int out_index = global_index / dst_ne0;
|
||||
|
||||
float accumulator = 0;
|
||||
|
||||
for (int c = 0; c < src0_ne2; c++) {
|
||||
int idx = global_index % dst_ne0;
|
||||
|
||||
int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
|
||||
int input_offset = src1_ne0 * c;
|
||||
|
||||
for (int i = 0; i < src1_ne0; i++) {
|
||||
if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
|
||||
continue;
|
||||
}
|
||||
int weight_idx = idx - i*s0;
|
||||
|
||||
float kernel_weight = src0[kernel_offset + weight_idx];
|
||||
float input_value = src1[input_offset+i];
|
||||
|
||||
accumulator += kernel_weight * input_value;
|
||||
}
|
||||
}
|
||||
dst[global_index] = accumulator;
|
||||
GGML_UNUSED_VARS(p0, d0, src0_ne3, src1_ne3, dst_ne3, src1_ne1, dst_ne1, src1_ne2, dst_ne2);
|
||||
}
|
||||
|
||||
static void conv_transpose_1d_f32_f32_cuda(
|
||||
const int s0, const int p0, const int d0, const int output_size,
|
||||
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
|
||||
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
|
||||
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
|
||||
const float * src0, const float * src1, float * dst,
|
||||
cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
|
||||
conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(
|
||||
s0,p0,d0,output_size,
|
||||
src0_ne0, src0_ne1, src0_ne2, src0_ne3,
|
||||
src1_ne0, src1_ne1, src1_ne2, src1_ne3,
|
||||
dst_ne0, dst_ne1, dst_ne2, dst_ne3,
|
||||
src0,src1, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
const int32_t * opts = (const int32_t *)dst->op_params;
|
||||
|
||||
const int s0 = opts[0];
|
||||
const int p0 = 0;//opts[3];
|
||||
const int d0 = 1;//opts[4];
|
||||
|
||||
const int64_t output_size = ggml_nelements(dst);
|
||||
|
||||
conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
src0_d, src1_d, dst_d, stream);
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,161 @@
|
||||
#include "conv2d-dw.cuh"
|
||||
|
||||
struct conv_params {
|
||||
int in_w, in_h;
|
||||
int out_w, out_h;
|
||||
int kernel_w, kernel_h;
|
||||
int stride_x, stride_y;
|
||||
int padding_x, padding_y;
|
||||
int dilation_x, dilation_y;
|
||||
int channels, batches;
|
||||
};
|
||||
|
||||
struct kernel_bounds {
|
||||
int y_min, y_max;
|
||||
int x_min, x_max;
|
||||
};
|
||||
|
||||
__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) {
|
||||
kernel_bounds bounds;
|
||||
bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y);
|
||||
bounds.y_max =
|
||||
min(params.kernel_h,
|
||||
(params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y);
|
||||
bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x);
|
||||
bounds.x_max =
|
||||
min(params.kernel_w,
|
||||
(params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x);
|
||||
return bounds;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
|
||||
return out_coord * stride + kern_coord * dilation - padding;
|
||||
}
|
||||
|
||||
struct whcn_layout {
|
||||
__device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
|
||||
return n * (params.channels * params.in_w * params.in_h) + c * params.in_w * params.in_h + y * params.in_w + x;
|
||||
}
|
||||
|
||||
__device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) {
|
||||
return c * params.kernel_h * params.kernel_w + ky * params.kernel_w + kx;
|
||||
}
|
||||
|
||||
__device__ static int output_index(int n, int c, int y, int x, const conv_params & params) {
|
||||
return n * (params.channels * params.out_w * params.out_h) + c * params.out_w * params.out_h +
|
||||
y * params.out_w + x;
|
||||
}
|
||||
|
||||
__device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y,
|
||||
int & out_x) {
|
||||
out_x = global_idx % params.out_w;
|
||||
out_y = (global_idx / params.out_w) % params.out_h;
|
||||
c = (global_idx / (params.out_w * params.out_h)) % params.channels;
|
||||
n = global_idx / (params.out_w * params.out_h * params.channels);
|
||||
}
|
||||
};
|
||||
|
||||
struct cwhn_layout {
|
||||
__device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
|
||||
return n * (params.channels * params.in_w * params.in_h) + (y * params.in_w + x) * params.channels + c;
|
||||
}
|
||||
|
||||
__device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) {
|
||||
return (ky * params.kernel_w + kx) * params.channels + c;
|
||||
}
|
||||
|
||||
__device__ static int output_index(int n, int c, int y, int x, const conv_params & params) {
|
||||
return n * (params.channels * params.out_w * params.out_h) + y * (params.out_w * params.channels) +
|
||||
x * params.channels + c;
|
||||
}
|
||||
|
||||
__device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y,
|
||||
int & out_x) {
|
||||
c = global_idx % params.channels;
|
||||
out_x = (global_idx / params.channels) % params.out_w;
|
||||
out_y = (global_idx / (params.channels * params.out_w)) % params.out_h;
|
||||
n = global_idx / (params.channels * params.out_w * params.out_h);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename Layout>
|
||||
__global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restrict__ kernel, T * __restrict__ output,
|
||||
const int in_w, const int in_h, const int out_w, const int out_h,
|
||||
const int kernel_w, const int kernel_h, const int stride_x, const int stride_y,
|
||||
const int padding_x, const int padding_y, const int dilation_x, const int dilation_y,
|
||||
const int channels, const int batches) {
|
||||
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int total_elements = batches * channels * out_h * out_w;
|
||||
|
||||
if (global_idx >= total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
conv_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x,
|
||||
stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches };
|
||||
|
||||
int batch_idx, channel_idx, out_y_idx, out_x_idx;
|
||||
Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx);
|
||||
|
||||
T accumulator = 0;
|
||||
kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params);
|
||||
|
||||
for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
|
||||
int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y);
|
||||
|
||||
for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
|
||||
int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x);
|
||||
|
||||
const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)];
|
||||
const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];
|
||||
|
||||
accumulator += input_val * kernel_val;
|
||||
}
|
||||
}
|
||||
|
||||
output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator;
|
||||
}
|
||||
|
||||
void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * input = dst->src[1];
|
||||
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
const float * w_d = (const float *) kernel->data;
|
||||
const float * x_d = (const float *) input->data;
|
||||
float * y_d = (float *) dst->data;
|
||||
|
||||
const int32_t * p = (const int32_t *) dst->op_params;
|
||||
const int stride_x = p[0];
|
||||
const int stride_y = p[1];
|
||||
const int padding_x = p[2];
|
||||
const int padding_y = p[3];
|
||||
const int dilation_x = p[4];
|
||||
const int dilation_y = p[5];
|
||||
|
||||
const int in_w = input->ne[0];
|
||||
const int in_h = input->ne[1];
|
||||
const int kernel_w = kernel->ne[0];
|
||||
const int kernel_h = kernel->ne[1];
|
||||
const int out_w = dst->ne[0];
|
||||
const int out_h = dst->ne[1];
|
||||
const int channels = dst->ne[2];
|
||||
const int batches = dst->ne[3];
|
||||
|
||||
cudaStream_t st = ctx.stream();
|
||||
|
||||
const int total = batches * channels * out_h * out_w;
|
||||
const int blocks = (total + CUDA_CONV2D_DW_BLOCK_SIZE - 1) / CUDA_CONV2D_DW_BLOCK_SIZE;
|
||||
|
||||
if (ggml_is_contiguous(input)) {
|
||||
conv2d_dw_kernel<float, whcn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>(
|
||||
x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
|
||||
dilation_x, dilation_y, channels, batches);
|
||||
} else if (ggml_is_contiguous_channels(input)) {
|
||||
conv2d_dw_kernel<float, cwhn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>(
|
||||
x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
|
||||
dilation_x, dilation_y, channels, batches);
|
||||
} else {
|
||||
GGML_ABORT("Unsupported memory layout for conv_2d_dw");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#pragma once
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CONV2D_DW_BLOCK_SIZE 256
|
||||
void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,91 @@
|
||||
#include <algorithm>
|
||||
|
||||
#include "conv2d-transpose.cuh"
|
||||
#include "ggml.h"
|
||||
|
||||
__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel,
|
||||
float * __restrict__ output, const int in_w, const int in_h, const int out_w,
|
||||
const int out_h, const int kernel_w, const int kernel_h, const int stride,
|
||||
const int c_in, const int c_out, const int batches) {
|
||||
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
const int total_elements = out_w * out_h * c_out * batches;
|
||||
|
||||
if (global_idx >= total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int out_x_idx = global_idx % out_w;
|
||||
const int out_y_idx = (global_idx / out_w) % out_h;
|
||||
const int c_idx = (global_idx / (out_w * out_h)) % c_out;
|
||||
const int n_idx = global_idx / (out_w * out_h * c_out);
|
||||
|
||||
float accumulator = 0;
|
||||
// For each output idx, find the inputs that contribute to it by checking stride alignment and bounds
|
||||
|
||||
for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
|
||||
for (int kh = 0; kh < kernel_h; ++kh) {
|
||||
int in_y = out_y_idx - kh;
|
||||
if (in_y < 0 || in_y % stride) continue;
|
||||
in_y /= stride;
|
||||
if (in_y >= in_h) continue;
|
||||
|
||||
for (int kw = 0; kw < kernel_w; ++kw) {
|
||||
int in_x = out_x_idx - kw;
|
||||
if (in_x < 0 || in_x % stride) continue;
|
||||
in_x /= stride;
|
||||
if (in_x >= in_w) continue;
|
||||
|
||||
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x;
|
||||
const int kernel_idx =
|
||||
(kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw;
|
||||
|
||||
float input_val = input[input_idx];
|
||||
half kern_val = kernel[kernel_idx];
|
||||
|
||||
accumulator += input_val * (float) kern_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + (out_w) *out_y_idx + out_x_idx] = accumulator;
|
||||
}
|
||||
|
||||
//input is (W, H, C_in, N), Kernel is (W, H, C_out, C_in)
|
||||
void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * input = dst->src[1];
|
||||
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
|
||||
const float * input_data = (const float *) input->data;
|
||||
float * output_data = (float *) dst->data;
|
||||
const half * kernel_data = (const half *) kernel->data;
|
||||
|
||||
const int input_w = input->ne[0];
|
||||
const int input_h = input->ne[1];
|
||||
const int output_w = dst->ne[0];
|
||||
const int output_h = dst->ne[1];
|
||||
const int channels_in = input->ne[2];
|
||||
const int channels_out = kernel->ne[2];
|
||||
const int kernel_w = kernel->ne[0];
|
||||
const int kernel_h = kernel->ne[1];
|
||||
const int stride = dst->op_params[0];
|
||||
const int batches = input->ne[3];
|
||||
|
||||
GGML_ASSERT(channels_in == kernel->ne[3]);
|
||||
GGML_ASSERT(stride > 0);
|
||||
|
||||
cudaStream_t st = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(input));
|
||||
GGML_ASSERT(ggml_is_contiguous(kernel));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
const int total = (output_w * output_h * channels_out * batches);
|
||||
const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE;
|
||||
|
||||
conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
|
||||
input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride,
|
||||
channels_in, channels_out, batches);
|
||||
}
|
||||
@@ -0,0 +1,4 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256
|
||||
void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,166 @@
|
||||
#include "conv2d.cuh"
|
||||
#include "convert.cuh"
|
||||
|
||||
struct conv_params {
|
||||
const int64_t IW, IH;
|
||||
const int64_t OW, OH;
|
||||
const int64_t KW, KH;
|
||||
const int64_t ST_X, ST_Y;
|
||||
const int64_t PD_X, PD_Y;
|
||||
const int64_t DL_X, DL_Y;
|
||||
const int64_t IC, OC;
|
||||
const int64_t B;
|
||||
const int64_t TOTAL;
|
||||
};
|
||||
|
||||
struct kernel_bounds {
|
||||
int64_t y_min, y_max;
|
||||
int64_t x_min, x_max;
|
||||
};
|
||||
|
||||
__device__ __forceinline__ int64_t max64(int64_t a, int64_t b) {
|
||||
return (a > b) ? a : b;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int64_t min64(int64_t a, int64_t b) {
|
||||
return (a < b) ? a : b;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv_params & P) {
|
||||
kernel_bounds bounds;
|
||||
bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y);
|
||||
bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y);
|
||||
bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X);
|
||||
bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X);
|
||||
return bounds;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int calculate_input_coord(int64_t out_coord,
|
||||
int64_t kern_coord,
|
||||
int64_t stride,
|
||||
int64_t dilation,
|
||||
int64_t padding) {
|
||||
return out_coord * stride + kern_coord * dilation - padding;
|
||||
}
|
||||
|
||||
struct whcn_layout {
|
||||
__device__ static int64_t input_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv_params & P) {
|
||||
return n * (P.IC * P.IW * P.IH) + c * P.IW * P.IH + y * P.IW + x;
|
||||
}
|
||||
|
||||
__device__ static int64_t kernel_index(int64_t c_out, int64_t c_in, int64_t ky, int64_t kx, const conv_params & P) {
|
||||
return c_out * (P.IC * P.KH * P.KW) + c_in * (P.KH * P.KW) + ky * P.KW + kx;
|
||||
}
|
||||
|
||||
__device__ static int64_t output_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv_params & P) {
|
||||
return n * (P.OC * P.OW * P.OH) + c * P.OW * P.OH + y * P.OW + x;
|
||||
}
|
||||
|
||||
__device__ static void unpack_indices(int64_t global_idx,
|
||||
const conv_params & P,
|
||||
int64_t & n,
|
||||
int64_t & c,
|
||||
int64_t & out_y,
|
||||
int64_t & out_x) {
|
||||
out_x = global_idx % P.OW;
|
||||
out_y = (global_idx / P.OW) % P.OH;
|
||||
c = (global_idx / (P.OW * P.OH)) % P.OC;
|
||||
n = global_idx / (P.OW * P.OH * P.OC);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename Layout>
|
||||
static __global__ void conv2d_kernel(const float * __restrict__ input,
|
||||
const T * __restrict__ kernel,
|
||||
float * __restrict__ output,
|
||||
const conv_params P) {
|
||||
const int64_t global_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (global_idx >= P.TOTAL) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t n, c_out, out_y, out_x;
|
||||
Layout::unpack_indices(global_idx, P, n, c_out, out_y, out_x);
|
||||
|
||||
float acc = 0.0f;
|
||||
|
||||
for (int64_t c_in = 0; c_in < P.IC; ++c_in) {
|
||||
kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P);
|
||||
|
||||
for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) {
|
||||
const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y);
|
||||
|
||||
for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) {
|
||||
const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X);
|
||||
|
||||
const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)];
|
||||
const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)];
|
||||
acc += (input_val * ggml_cuda_cast<float>(kernel_val));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// [N, OC, OH, OW]
|
||||
output[Layout::output_index(n, c_out, out_y, out_x, P)] = acc;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void conv2d_cuda(const float * X_D, const T * K_D, float * Y_D, const conv_params P, cudaStream_t st) {
|
||||
const int blocks = (P.TOTAL + CUDA_CONV2D_BLOCK_SIZE - 1) / CUDA_CONV2D_BLOCK_SIZE;
|
||||
conv2d_kernel<T, whcn_layout><<<blocks, CUDA_CONV2D_BLOCK_SIZE, 0, st>>>(X_D, K_D, Y_D, P);
|
||||
}
|
||||
|
||||
static void conv2d_cuda_f16(const float * X_D, const half * K_D, float * Y_D, const conv_params P, cudaStream_t st) {
|
||||
conv2d_cuda<half>(X_D, K_D, Y_D, P, st);
|
||||
}
|
||||
|
||||
static void conv2d_cuda_f32(const float * X_D, const float * K_D, float * Y_D, const conv_params P, cudaStream_t st) {
|
||||
conv2d_cuda<float>(X_D, K_D, Y_D, P, st);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * input = dst->src[1];
|
||||
float * K_D = (float *) kernel->data;
|
||||
const float * X_D = (const float *) input->data;
|
||||
float * Y_D = (float *) dst->data;
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(kernel));
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
|
||||
|
||||
// same number of input channels
|
||||
GGML_ASSERT(input->ne[2] == kernel->ne[2]);
|
||||
|
||||
cudaStream_t st = ctx.stream();
|
||||
|
||||
const int32_t * p = (const int32_t *) dst->op_params;
|
||||
const int ST_X = p[0]; // stride_x
|
||||
const int ST_Y = p[1]; // stride_y
|
||||
const int PD_X = p[2]; // padding_x
|
||||
const int PD_Y = p[3]; // padding_y
|
||||
const int DL_X = p[4]; // dilation_x
|
||||
const int DL_Y = p[5]; // dilation_y
|
||||
|
||||
// No cwhn
|
||||
GGML_ASSERT(p[6] == false);
|
||||
|
||||
const int IW = input->ne[0]; // input_w
|
||||
const int IH = input->ne[1]; // input_h
|
||||
const int OW = dst->ne[0]; // output_w
|
||||
const int OH = dst->ne[1]; // output_h
|
||||
const int KW = kernel->ne[0]; // kernel_w
|
||||
const int KH = kernel->ne[1]; // kernel_h
|
||||
const int IC = input->ne[2]; // input_channels
|
||||
const int OC = kernel->ne[3]; // ouptut_chanles
|
||||
const int B = input->ne[3]; // n_batches
|
||||
|
||||
const int64_t total = B * OC * OH * OW;
|
||||
conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total };
|
||||
|
||||
if (kernel->type == GGML_TYPE_F16) {
|
||||
conv2d_cuda_f16(X_D, (half *) K_D, Y_D, params, st);
|
||||
} else {
|
||||
conv2d_cuda_f32(X_D, K_D, Y_D, params, st);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#pragma once
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CONV2D_BLOCK_SIZE 256
|
||||
void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,825 @@
|
||||
#include "convert.cuh"
|
||||
#include "dequantize.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define CUDA_Q8_0_NE_ALIGN 2048
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03) {
|
||||
const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x);
|
||||
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i01 = blockIdx.y;
|
||||
const int64_t i02 = blockIdx.z % ne02;
|
||||
const int64_t i03 = blockIdx.z / ne02;
|
||||
|
||||
const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
|
||||
|
||||
const int64_t ib = ibx0 + i00/qk; // block index
|
||||
const int64_t iqs = (i00%qk)/qr; // quant index
|
||||
const int64_t iybs = i00 - i00%qk; // y block start index
|
||||
const int64_t y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
// dequantize
|
||||
float2 v;
|
||||
dequantize_kernel(vx, ib, iqs, v);
|
||||
|
||||
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
|
||||
y[iy0 + 0] = ggml_cuda_cast<dst_t>(v.x);
|
||||
y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
|
||||
}
|
||||
|
||||
template <bool need_check>
|
||||
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
||||
|
||||
const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
|
||||
const int * x0 = ((int *) vx) + blockIdx.x * nint;
|
||||
half2 * y2 = (half2 *) (y + i0);
|
||||
|
||||
__shared__ int vals[nint];
|
||||
|
||||
#pragma unroll
|
||||
for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
|
||||
if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int ix = ix0 + threadIdx.x;
|
||||
vals[ix] = x0[ix];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
|
||||
if (need_check && i0 + iy + 2*threadIdx.x >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
|
||||
const half d = *b0;
|
||||
const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
|
||||
|
||||
y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(vx, y, k);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
// assume 32 threads
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8;
|
||||
const int64_t ir = tid%8;
|
||||
const int64_t ib = 8*i + ir;
|
||||
if (ib >= nb32) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
||||
|
||||
const block_q4_0 * x = (const block_q4_0 *)vx + ib;
|
||||
const float d = __half2float(x->d);
|
||||
const float dm = -8*d;
|
||||
|
||||
const uint8_t * q = x->qs + 4*il;
|
||||
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
y[l+ 0] = d * (q[l] & 0xF) + dm;
|
||||
y[l+16] = d * (q[l] >> 4) + dm;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
// assume 32 threads
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8;
|
||||
const int64_t ir = tid%8;
|
||||
const int64_t ib = 8*i + ir;
|
||||
if (ib >= nb32) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
||||
|
||||
const block_q4_1 * x = (const block_q4_1 *)vx + ib;
|
||||
const float2 d = __half22float2(x->dm);
|
||||
|
||||
const uint8_t * q = x->qs + 4*il;
|
||||
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
|
||||
y[l+16] = d.x * (q[l] >> 4) + d.y;
|
||||
}
|
||||
}
|
||||
|
||||
//================================== k-quants
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t n = tid/32;
|
||||
const int64_t l = tid - 32*n;
|
||||
const int64_t is = 8*n + l/16;
|
||||
|
||||
const uint8_t q = x[i].qs[32*n + l];
|
||||
dst_t * y = yy + i*QK_K + 128*n;
|
||||
|
||||
float dall = __low2half(x[i].dm);
|
||||
float dmin = __high2half(x[i].dm);
|
||||
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
||||
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
||||
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_q3_K * x = (const block_q3_K *) vx;
|
||||
|
||||
const int64_t r = threadIdx.x/4;
|
||||
const int64_t tid = r/2;
|
||||
const int64_t is0 = r%2;
|
||||
const int64_t l0 = 16*is0 + 4*(threadIdx.x%4);
|
||||
const int64_t n = tid / 4;
|
||||
const int64_t j = tid - 4*n;
|
||||
|
||||
uint8_t m = 1 << (4*n + j);
|
||||
int64_t is = 8*n + 2*j + is0;
|
||||
int shift = 2*j;
|
||||
|
||||
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
|
||||
is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
|
||||
is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
|
||||
(x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
|
||||
float d_all = x[i].d;
|
||||
float dl = d_all * (us - 32);
|
||||
|
||||
dst_t * y = yy + i*QK_K + 128*n + 32*j;
|
||||
const uint8_t * q = x[i].qs + 32*n;
|
||||
const uint8_t * hm = x[i].hmask;
|
||||
|
||||
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
||||
}
|
||||
|
||||
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
||||
if (j < 4) {
|
||||
d = q[j] & 63; m = q[j + 4] & 63;
|
||||
} else {
|
||||
d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
|
||||
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q4_K * x = (const block_q4_K *) vx;
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
// assume 32 threads
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8;
|
||||
const int64_t ir = tid%8;
|
||||
const int64_t is = 2*il;
|
||||
const int64_t n = 4;
|
||||
|
||||
dst_t * y = yy + i*QK_K + 64*il + n*ir;
|
||||
|
||||
const float dall = __low2half(x[i].dm);
|
||||
const float dmin = __high2half(x[i].dm);
|
||||
|
||||
const uint8_t * q = x[i].qs + 32*il + n*ir;
|
||||
|
||||
uint8_t sc, m;
|
||||
get_scale_min_k4(is + 0, x[i].scales, sc, m);
|
||||
const float d1 = dall * sc; const float m1 = dmin * m;
|
||||
get_scale_min_k4(is + 1, x[i].scales, sc, m);
|
||||
const float d2 = dall * sc; const float m2 = dmin * m;
|
||||
for (int l = 0; l < n; ++l) {
|
||||
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
||||
y[l +32] = d2 * (q[l] >> 4) - m2;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q5_K * x = (const block_q5_K *) vx;
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/16; // il is in 0...3
|
||||
const int64_t ir = tid%16; // ir is in 0...15
|
||||
const int64_t is = 2*il; // is is in 0...6
|
||||
|
||||
dst_t * y = yy + i*QK_K + 64*il + 2*ir;
|
||||
|
||||
const float dall = __low2half(x[i].dm);
|
||||
const float dmin = __high2half(x[i].dm);
|
||||
|
||||
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
|
||||
const uint8_t * qh = x[i].qh + 2*ir;
|
||||
|
||||
uint8_t sc, m;
|
||||
get_scale_min_k4(is + 0, x[i].scales, sc, m);
|
||||
const float d1 = dall * sc; const float m1 = dmin * m;
|
||||
get_scale_min_k4(is + 1, x[i].scales, sc, m);
|
||||
const float d2 = dall * sc; const float m2 = dmin * m;
|
||||
|
||||
uint8_t hm = 1 << (2*il);
|
||||
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
|
||||
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
|
||||
hm <<= 1;
|
||||
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q6_K * x = (const block_q6_K *) vx;
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t ip = tid/32; // ip is 0 or 1
|
||||
const int64_t il = tid - 32*ip; // 0...32
|
||||
const int64_t is = 8*ip + il/16;
|
||||
|
||||
dst_t * y = yy + i*QK_K + 128*ip + il;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const uint8_t * ql = x[i].ql + 64*ip + il;
|
||||
const uint8_t qh = x[i].qh[32*ip + il];
|
||||
const int8_t * sc = x[i].scales + is;
|
||||
|
||||
y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
||||
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
||||
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint16_t * q2 = x[i].qs + 4*ib;
|
||||
const uint8_t * aux8 = (const uint8_t *)q2;
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
|
||||
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
||||
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint16_t * q2 = x[i].qs + 4*ib;
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
|
||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq2_s * x = (const block_iq2_s *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
|
||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * q3 = x[i].qs + 8*ib;
|
||||
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * qs = x[i].qs + 8*ib;
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
|
||||
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
|
||||
const uint8_t signs = x[i].signs[4*ib + il];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq1_s * x = (const block_iq1_s *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
|
||||
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
|
||||
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
||||
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
|
||||
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
||||
grid32[0] &= 0x0f0f0f0f;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * (q[j] + delta);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq1_m * x = (const block_iq1_m *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
||||
iq1m_scale_t scale;
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
|
||||
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
|
||||
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
|
||||
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
||||
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
|
||||
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
||||
grid32[0] &= 0x0f0f0f0f;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * (q[j] + delta);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[ib].qs + 4*il;
|
||||
const float d = (float)x[ib].d;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
|
||||
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_mxfp4 * x = (const block_mxfp4 *) vx + i*(QK_K/QK_MXFP4);
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[ib].qs + 4*il;
|
||||
const float d = ggml_cuda_e8m0_to_fp32(x[ib].e);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = d * kvalues_mxfp4[q4[j] & 0xf]*0.5f;
|
||||
y[j+16] = d * kvalues_mxfp4[q4[j] >> 4]*0.5f;
|
||||
}
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_cuda(const void * vx, dst_t * y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
|
||||
const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, ne02*ne03);
|
||||
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
|
||||
(vx, y, ne00, ne01, ne02, s01, s02, s03);
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_cont_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
dequantize_block_cuda<qk, qr, dequantize_kernel, dst_t>(vx, y, k, 1, 1, 1, k/qk, k/qk, k/qk, stream);
|
||||
}
|
||||
|
||||
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
|
||||
if (k % CUDA_Q8_0_NE_ALIGN == 0) {
|
||||
const bool need_check = false;
|
||||
dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
|
||||
} else {
|
||||
const bool need_check = true;
|
||||
dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb32 = k / 32;
|
||||
const int nb = (k + 255) / 256;
|
||||
dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb32 = k / 32;
|
||||
const int nb = (k + 255) / 256;
|
||||
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static __global__ void convert_unary(
|
||||
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03) {
|
||||
const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i01 = blockIdx.y;
|
||||
const int64_t i02 = blockIdx.z % ne02;
|
||||
const int64_t i03 = blockIdx.z / ne02;
|
||||
|
||||
const src_t * x = (const src_t *) vx;
|
||||
|
||||
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
|
||||
const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
|
||||
y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cuda(const void * vx, dst_t * y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
|
||||
const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
|
||||
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
|
||||
(vx, y, ne00, ne01, ne02, s01, s02, s03);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
|
||||
}
|
||||
|
||||
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cont_cuda<float>;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cont_cuda<half>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_row_q4_0_cuda;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_row_q4_1_cuda;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
|
||||
return dequantize_block_q8_0_f16_cuda;
|
||||
}
|
||||
return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return dequantize_row_q2_K_cuda;
|
||||
case GGML_TYPE_Q3_K:
|
||||
return dequantize_row_q3_K_cuda;
|
||||
case GGML_TYPE_Q4_K:
|
||||
return dequantize_row_q4_K_cuda;
|
||||
case GGML_TYPE_Q5_K:
|
||||
return dequantize_row_q5_K_cuda;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return dequantize_row_q6_K_cuda;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
return dequantize_row_iq2_xxs_cuda;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
return dequantize_row_iq2_xs_cuda;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
return dequantize_row_iq2_s_cuda;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return dequantize_row_iq1_s_cuda;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
return dequantize_row_iq1_m_cuda;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_cuda;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return dequantize_row_iq4_xs_cuda;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_MXFP4:
|
||||
return dequantize_row_mxfp4_cuda;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cont_cuda<float>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cont_cuda<nv_bfloat16>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_row_q4_0_cuda;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_row_q4_1_cuda;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return dequantize_row_q2_K_cuda;
|
||||
case GGML_TYPE_Q3_K:
|
||||
return dequantize_row_q3_K_cuda;
|
||||
case GGML_TYPE_Q4_K:
|
||||
return dequantize_row_q4_K_cuda;
|
||||
case GGML_TYPE_Q5_K:
|
||||
return dequantize_row_q5_K_cuda;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return dequantize_row_q6_K_cuda;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
return dequantize_row_iq2_xxs_cuda;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
return dequantize_row_iq2_xs_cuda;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
return dequantize_row_iq2_s_cuda;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return dequantize_row_iq1_s_cuda;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
return dequantize_row_iq1_m_cuda;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_cuda;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return dequantize_row_iq4_xs_cuda;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_MXFP4:
|
||||
return dequantize_row_mxfp4_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cont_cuda<half>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cont_cuda<nv_bfloat16>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float, nv_bfloat16>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half, nv_bfloat16>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half, float>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16, float>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
#pragma once
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
||||
|
||||
template<typename T>
|
||||
using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);
|
||||
|
||||
typedef to_t_cuda_t<float> to_fp32_cuda_t;
|
||||
typedef to_t_cuda_t<half> to_fp16_cuda_t;
|
||||
typedef to_t_cuda_t<nv_bfloat16> to_bf16_cuda_t;
|
||||
|
||||
to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
|
||||
|
||||
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
|
||||
|
||||
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
|
||||
|
||||
// TODO more general support for non-contiguous inputs
|
||||
|
||||
template<typename T>
|
||||
using to_t_nc_cuda_t = void (*)(const void * x, T * y,
|
||||
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
|
||||
int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
|
||||
|
||||
typedef to_t_nc_cuda_t<float> to_fp32_nc_cuda_t;
|
||||
typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
|
||||
typedef to_t_nc_cuda_t<nv_bfloat16> to_bf16_nc_cuda_t;
|
||||
|
||||
to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
|
||||
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
|
||||
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
|
||||
|
||||
template<typename dst_t, typename src_t>
|
||||
__host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
|
||||
if constexpr (std::is_same_v<dst_t, src_t>) {
|
||||
return x;
|
||||
} else if constexpr(std::is_same_v<dst_t, nv_bfloat16>) {
|
||||
return __float2bfloat16(float(x));
|
||||
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
|
||||
return __bfloat162float(x);
|
||||
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, half2>) {
|
||||
return __float22half2_rn(x);
|
||||
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, nv_bfloat162>) {
|
||||
// bypass compile error on cuda 12.0.1
|
||||
#ifdef GGML_USE_HIP
|
||||
return __float22bfloat162_rn(x);
|
||||
#else
|
||||
return {x.x, x.y};
|
||||
#endif // GGML_USE_HIP
|
||||
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
|
||||
return int32_t(x);
|
||||
} else {
|
||||
return float(x);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
#include "common.cuh"
|
||||
#include "count-equal.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
template <typename T>
|
||||
static __global__ void count_equal(const T * __restrict__ x, const T * __restrict__ y, int64_t * __restrict__ dst, const int64_t dk, const int64_t k) {
|
||||
const int64_t i0 = (int64_t) blockIdx.x*dk;
|
||||
const int64_t i1 = min(i0 + dk, k);
|
||||
|
||||
int nequal = 0;
|
||||
|
||||
for (int64_t i = i0 + threadIdx.x; i < i1; i += WARP_SIZE) {
|
||||
const T xi = x[i];
|
||||
const T yi = y[i];
|
||||
nequal += xi == yi;
|
||||
}
|
||||
|
||||
nequal = warp_reduce_sum(nequal);
|
||||
|
||||
if (threadIdx.x != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
atomicAdd((int *) dst, nequal);
|
||||
}
|
||||
|
||||
void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_I64);
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src1));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
int64_t * dst_d = (int64_t *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
||||
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
|
||||
const int64_t dne = GGML_PAD((ne + 4*nsm - 1) / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
|
||||
|
||||
CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));
|
||||
|
||||
const dim3 blocks_dim(WARP_SIZE, 1, 1);
|
||||
const dim3 blocks_num(std::min((int64_t)4*nsm, (ne + CUDA_COUNT_EQUAL_CHUNK_SIZE - 1)/CUDA_COUNT_EQUAL_CHUNK_SIZE), 1, 1);
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_I32: {
|
||||
const int * src0_d = (const int *) src0->data;
|
||||
const int * src1_d = (const int *) src1->data;
|
||||
count_equal<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_d, dne, ne);
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128
|
||||
|
||||
void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,57 @@
|
||||
// Simplified API for asynchronous data loading.
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
|
||||
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
|
||||
#ifdef CP_ASYNC_AVAILABLE
|
||||
return __cvta_generic_to_shared(generic_ptr);
|
||||
#else
|
||||
GGML_UNUSED(generic_ptr);
|
||||
NO_DEVICE_CODE;
|
||||
return 0;
|
||||
#endif // CP_ASYNC_AVAILABLE
|
||||
}
|
||||
|
||||
// Copies data from global to shared memory, cg == cache global.
|
||||
// Both the src and dst pointers must be aligned to 16 bit.
|
||||
// Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
|
||||
// Generic pointers can be converted to 32 bit shared memory pointers using __cvta_generic_to_shared.
|
||||
// Only the 16 bit copy is exposed because 4 and 8 bit copies did not yield performance improvements.
|
||||
template <int preload>
|
||||
static __device__ __forceinline__ void cp_async_cg_16(const unsigned int dst, const void * src) {
|
||||
static_assert(preload == 0 || preload == 64 || preload == 128 || preload == 256, "bad preload");
|
||||
#ifdef CP_ASYNC_AVAILABLE
|
||||
#if CUDART_VERSION >= 11040
|
||||
if (preload == 256) {
|
||||
asm volatile("cp.async.cg.shared.global.L2::256B [%0], [%1], 16;"
|
||||
: : "r"(dst), "l"(src));
|
||||
} else if (preload == 128) {
|
||||
asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], 16;"
|
||||
: : "r"(dst), "l"(src));
|
||||
} else if (preload == 64) {
|
||||
asm volatile("cp.async.cg.shared.global.L2::64B [%0], [%1], 16;"
|
||||
: : "r"(dst), "l"(src));
|
||||
} else
|
||||
#endif // CUDART_VERSION >= 11040
|
||||
{
|
||||
asm volatile("cp.async.cg.shared.global [%0], [%1], 16;"
|
||||
: : "r"(dst), "l"(src));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(dst);
|
||||
GGML_UNUSED(src);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // CP_ASYNC_AVAILABLE
|
||||
}
|
||||
|
||||
// Makes each thread wait until its asynchronous data copies are done.
|
||||
// This does NOT provide any additional synchronization.
|
||||
// In particular, when copying data with multiple warps a call to __syncthreads will be needed.
|
||||
static __device__ __forceinline__ void cp_async_wait_all() {
|
||||
#ifdef CP_ASYNC_AVAILABLE
|
||||
asm volatile("cp.async.wait_all;");
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // CP_ASYNC_AVAILABLE
|
||||
}
|
||||
@@ -0,0 +1,217 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-common.h"
|
||||
#include "convert.cuh"
|
||||
|
||||
static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
|
||||
if (x <= val[0]) return 0;
|
||||
if (x >= val[n-1]) return n-1;
|
||||
int ml = 0, mu = n-1;
|
||||
while (mu-ml > 1) {
|
||||
int mav = (ml+mu)/2;
|
||||
if (x < val[mav]) mu = mav; else ml = mav;
|
||||
}
|
||||
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
||||
}
|
||||
|
||||
static __device__ void quantize_f32_q4_0_block(const float * __restrict__ x, block_q4_0 * __restrict__ y) {
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_0; ++j) {
|
||||
const float v = x[j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = vmax / -8;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y->d = d;
|
||||
|
||||
for (int j = 0; j < QK4_0/2; ++j) {
|
||||
const float x0 = x[0 + j]*id;
|
||||
const float x1 = x[QK4_0/2 + j]*id;
|
||||
|
||||
const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
|
||||
const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
|
||||
|
||||
y->qs[j] = xi0;
|
||||
y->qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ void quantize_f32_q4_1_block(const float * __restrict__ x, block_q4_1 * __restrict__ y) {
|
||||
float vmin = FLT_MAX;
|
||||
float vmax = -FLT_MAX;
|
||||
|
||||
for (int j = 0; j < QK4_1; ++j) {
|
||||
const float v = x[j];
|
||||
if (v < vmin) vmin = v;
|
||||
if (v > vmax) vmax = v;
|
||||
}
|
||||
|
||||
const float d = (vmax - vmin) / ((1 << 4) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y->dm.x = d;
|
||||
y->dm.y = vmin;
|
||||
|
||||
for (int j = 0; j < QK4_1/2; ++j) {
|
||||
const float x0 = (x[0 + j] - vmin)*id;
|
||||
const float x1 = (x[QK4_1/2 + j] - vmin)*id;
|
||||
|
||||
const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
|
||||
const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
|
||||
|
||||
y->qs[j] = xi0;
|
||||
y->qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ void quantize_f32_q5_0_block(const float * __restrict__ x, block_q5_0 * __restrict__ y) {
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK5_0; ++j) {
|
||||
const float v = x[j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = vmax / -16;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y->d = d;
|
||||
|
||||
uint32_t qh = 0;
|
||||
for (int j = 0; j < QK5_0/2; ++j) {
|
||||
const float x0 = x[0 + j]*id;
|
||||
const float x1 = x[QK5_0/2 + j]*id;
|
||||
|
||||
const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
|
||||
const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
|
||||
|
||||
y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
||||
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
|
||||
}
|
||||
memcpy(y->qh, &qh, sizeof(qh));
|
||||
}
|
||||
|
||||
static __device__ void quantize_f32_q5_1_block(const float * __restrict__ x, block_q5_1 * __restrict__ y) {
|
||||
float min = x[0];
|
||||
float max = x[0];
|
||||
|
||||
for (int j = 1; j < QK5_1; ++j) {
|
||||
const float v = x[j];
|
||||
min = v < min ? v : min;
|
||||
max = v > max ? v : max;
|
||||
}
|
||||
|
||||
const float d = (max - min) / 31;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y->dm.x = d;
|
||||
y->dm.y = min;
|
||||
|
||||
uint32_t qh = 0;
|
||||
for (int j = 0; j < QK5_1/2; ++j) {
|
||||
const float x0 = (x[0 + j] - min)*id;
|
||||
const float x1 = (x[QK5_1/2 + j] - min)*id;
|
||||
|
||||
const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
|
||||
const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
|
||||
|
||||
y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
||||
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
|
||||
}
|
||||
memcpy(y->qh, &qh, sizeof(qh));
|
||||
}
|
||||
|
||||
static __device__ void quantize_f32_q8_0_block(const float * __restrict__ x, block_q8_0 * __restrict__ y) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const float v = x[j];
|
||||
amax = fmaxf(amax, fabsf(v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y->d = d;
|
||||
|
||||
for (int j = 0; j < QK8_0; ++j) {
|
||||
const float x0 = x[j]*id;
|
||||
y->qs[j] = roundf(x0);
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ void quantize_f32_iq4_nl_block(const float * __restrict__ x, block_iq4_nl * __restrict__ y) {
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_NL; ++j) {
|
||||
const float v = x[j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
float d = vmax / kvalues_iq4nl[0];
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int j = 0; j < QK4_NL/2; ++j) {
|
||||
const float x0 = x[0 + j]*id;
|
||||
const float x1 = x[QK4_NL/2 + j]*id;
|
||||
const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
|
||||
const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
|
||||
y->qs[j] = xi0 | (xi1 << 4);
|
||||
const float v0 = kvalues_iq4nl[xi0];
|
||||
const float v1 = kvalues_iq4nl[xi1];
|
||||
const float w0 = x[0 + j]*x[0 + j];
|
||||
const float w1 = x[QK4_NL/2 + j]*x[QK4_NL/2 + j];
|
||||
sumqx += w0*v0*x[j] + w1*v1*x[QK4_NL/2 + j];
|
||||
sumq2 += w0*v0*v0 + w1*v1*v1;
|
||||
}
|
||||
|
||||
y->d = sumq2 > 0 ? sumqx/sumq2 : d;
|
||||
}
|
||||
|
||||
// Wrapper functions for cpy.cu compatibility
|
||||
static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
||||
quantize_f32_q4_0_block((const float *)cxi, (block_q4_0 *)cdsti);
|
||||
}
|
||||
|
||||
static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
||||
quantize_f32_q4_1_block((const float *)cxi, (block_q4_1 *)cdsti);
|
||||
}
|
||||
|
||||
static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
|
||||
quantize_f32_q5_0_block((const float *)cxi, (block_q5_0 *)cdsti);
|
||||
}
|
||||
|
||||
static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
|
||||
quantize_f32_q5_1_block((const float *)cxi, (block_q5_1 *)cdsti);
|
||||
}
|
||||
|
||||
static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
||||
quantize_f32_q8_0_block((const float *)cxi, (block_q8_0 *)cdsti);
|
||||
}
|
||||
|
||||
static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
|
||||
quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) {
|
||||
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
|
||||
}
|
||||
@@ -0,0 +1,555 @@
|
||||
#include "cpy.cuh"
|
||||
#include "dequantize.cuh"
|
||||
#include "cpy-utils.cuh"
|
||||
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
|
||||
#include "ggml-musa/mudnn.cuh"
|
||||
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
|
||||
|
||||
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
||||
|
||||
const int CUDA_CPY_TILE_DIM_2D = 32; // 2D tile dimension for transposed blocks
|
||||
const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
|
||||
const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
|
||||
const int64_t nb12, const int64_t nb13) {
|
||||
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// then combine those indices with the corresponding byte offsets to get the total offsets
|
||||
const int64_t i03 = i/(ne00 * ne01 * ne02);
|
||||
const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int64_t i13 = i/(ne10 * ne11 * ne12);
|
||||
const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
||||
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
|
||||
const int64_t nb12, const int64_t nb13) {
|
||||
|
||||
const T* src = reinterpret_cast<const T*>(cx);
|
||||
T* dst = reinterpret_cast<T*>(cdst);
|
||||
|
||||
const int64_t nmat = ne / (ne00 * ne01);
|
||||
const int64_t n = ne00 * ne01;
|
||||
|
||||
const int x = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
|
||||
const int y = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
||||
const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
|
||||
const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
||||
|
||||
__shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
|
||||
|
||||
const unsigned int imat = blockIdx.z * CUDA_CPY_BLOCK_NM + i;
|
||||
if (imat >= nmat)
|
||||
break;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
|
||||
if(x < ne01 && y + j < ne00){
|
||||
const int row = threadIdx.y+j;
|
||||
const int col = threadIdx.x * sizeof(float)/sizeof(T);
|
||||
T *tile2 = reinterpret_cast<T*>(tile[row]);
|
||||
tile2[col] = src[imat*n + (y+j)*ne01 + x];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
|
||||
if (ty + j < ne01 && tx < ne00) {
|
||||
const int col = (threadIdx.y+j)*sizeof(float)/sizeof(T);
|
||||
const T *tile2 = reinterpret_cast<const T*>(tile[threadIdx.x]);
|
||||
dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GGML_UNUSED_VARS(ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11,
|
||||
nb12, nb13);
|
||||
}
|
||||
|
||||
static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
|
||||
float * cdstf = (float *)(cdsti);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < QK8_0; j += 2) {
|
||||
float2 dq;
|
||||
dequantize_q8_0(cxi, 0, j, dq);
|
||||
*(cdstf + j) = dq.x;
|
||||
*(cdstf + j + 1) = dq.y;
|
||||
}
|
||||
}
|
||||
|
||||
template<dequantize_kernel_t dequant, int qk>
|
||||
static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
|
||||
float * cdstf = (float *)(cdsti);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < qk/2; j++) {
|
||||
float2 dq;
|
||||
dequant(cxi, 0, j, dq);
|
||||
*(cdstf + j) = dq.x;
|
||||
*(cdstf + j + qk/2) = dq.y;
|
||||
}
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
|
||||
const int64_t nb12, const int64_t nb13) {
|
||||
const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i03 = i/(ne00 * ne01 * ne02);
|
||||
const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int64_t i13 = i/(ne10 * ne11 * ne12);
|
||||
const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
|
||||
const int64_t nb12, const int64_t nb13) {
|
||||
const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i03 = i/(ne00 * ne01 * ne02);
|
||||
const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int64_t i13 = i/(ne10 * ne11 * ne12);
|
||||
const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const int64_t ne) {
|
||||
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const src_t * x = (const src_t *) cx;
|
||||
dst_t * dst = (dst_t *) cdst;
|
||||
|
||||
dst[i] = ggml_cuda_cast<dst_t>(x[i]);
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static void ggml_cpy_scalar_contiguous_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
cudaStream_t stream) {
|
||||
|
||||
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne);
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t, bool transposed = false>
|
||||
static void ggml_cpy_scalar_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
|
||||
|
||||
if (transposed) {
|
||||
GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed
|
||||
int64_t ne00n, ne01n, ne02n;
|
||||
if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here
|
||||
ne00n = ne00;
|
||||
ne01n = ne01;
|
||||
ne02n = ne02;
|
||||
} else {
|
||||
ne00n = ne00;
|
||||
ne01n = ne01*ne02;
|
||||
ne02n = 1;
|
||||
}
|
||||
|
||||
int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
|
||||
int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
|
||||
int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM;
|
||||
GGML_ASSERT(grid_x < UINT_MAX);
|
||||
GGML_ASSERT(grid_y < USHRT_MAX);
|
||||
GGML_ASSERT(grid_z < USHRT_MAX);
|
||||
dim3 dimGrid(grid_x, grid_y, grid_z);
|
||||
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
|
||||
cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
|
||||
(cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
} else {
|
||||
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q8_0_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK8_0 == 0);
|
||||
const int64_t num_blocks = ne / QK8_0;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_q8_0_f32_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
|
||||
|
||||
const int64_t num_blocks = ne;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_0_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_0 == 0);
|
||||
const int64_t num_blocks = ne / QK4_0;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_q4_0_f32_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
|
||||
const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
|
||||
cudaStream_t stream) {
|
||||
const int64_t num_blocks = ne;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_1_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_1 == 0);
|
||||
const int64_t num_blocks = ne / QK4_1;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_q4_1_f32_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
|
||||
const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
|
||||
cudaStream_t stream) {
|
||||
const int64_t num_blocks = ne;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q5_0_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK5_0 == 0);
|
||||
const int64_t num_blocks = ne / QK5_0;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_q5_0_f32_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
|
||||
const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
|
||||
cudaStream_t stream) {
|
||||
const int64_t num_blocks = ne;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q5_1_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK5_1 == 0);
|
||||
const int64_t num_blocks = ne / QK5_1;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_q5_1_f32_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
|
||||
const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
|
||||
cudaStream_t stream) {
|
||||
const int64_t num_blocks = ne;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_NL == 0);
|
||||
const int64_t num_blocks = ne / QK4_NL;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
||||
//GGML_ASSERT(src0->ne[3] == 1);
|
||||
|
||||
const int64_t nb00 = src0->nb[0];
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2];
|
||||
const int64_t nb03 = src0->nb[3];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
//GGML_ASSERT(src1->ne[3] == 1);
|
||||
|
||||
const int64_t nb10 = src1->nb[0];
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
const int64_t nb12 = src1->nb[2];
|
||||
const int64_t nb13 = src1->nb[3];
|
||||
|
||||
cudaStream_t main_stream = ctx.stream();
|
||||
|
||||
char * src0_ddc = (char *) src0->data;
|
||||
char * src1_ddc = (char *) src1->data;
|
||||
|
||||
const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
|
||||
const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) &&
|
||||
src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);
|
||||
|
||||
if (src0->type == src1->type && contiguous_srcs) {
|
||||
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
|
||||
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
|
||||
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
|
||||
CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
|
||||
} else
|
||||
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
|
||||
{
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_scalar_cuda<float, float, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<float, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_scalar_contiguous_cuda<float, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<float, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_scalar_contiguous_cuda<float, half>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<float, half>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
ggml_cpy_f32_q8_0_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q8_0_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||
ggml_cpy_f32_q4_0_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q4_0_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||
ggml_cpy_f32_q4_1_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q4_1_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
|
||||
ggml_cpy_f32_q5_0_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_0_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
|
||||
ggml_cpy_f32_iq4_nl_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
|
||||
ggml_cpy_f32_q5_1_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_1_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_scalar_cuda<half, half, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<half, half>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_scalar_contiguous_cuda<half, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<half, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_scalar_contiguous_cuda<half, float>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<half, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_scalar_contiguous_cuda<nv_bfloat16, half>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, half>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_scalar_contiguous_cuda<nv_bfloat16, float>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_scalar_cuda<int32_t, int32_t, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<int32_t, int32_t>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_scalar_contiguous_cuda<float, int32_t>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<float, int32_t>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_scalar_contiguous_cuda<int32_t, float>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<int32_t, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
ggml_cuda_cpy(ctx, src0, dst);
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CPY_BLOCK_SIZE 64
|
||||
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
|
||||
|
||||
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,177 @@
|
||||
#include "common.cuh"
|
||||
#include "cross-entropy-loss.cuh"
|
||||
#include "sum.cuh"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
|
||||
template <bool use_shared>
|
||||
static __global__ void cross_entropy_loss_f32(
|
||||
const float * __restrict__ logits, const float * __restrict__ labels, float * __restrict__ dst, const int nclasses, const int k) {
|
||||
extern __shared__ float tmp[];
|
||||
|
||||
logits += int64_t(blockIdx.x)*nclasses;
|
||||
labels += int64_t(blockIdx.x)*nclasses;
|
||||
|
||||
// Find maximum for softmax:
|
||||
float max_logit = -INFINITY;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float val = logits[i];
|
||||
max_logit = fmaxf(max_logit, val);
|
||||
|
||||
if (use_shared) {
|
||||
tmp[i] = val;
|
||||
}
|
||||
}
|
||||
max_logit = warp_reduce_max(max_logit);
|
||||
|
||||
// Calculate log(softmax(logits)) which is just logits - max:
|
||||
float sum = 0.0f;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float logit_i = use_shared ? tmp[i] : logits[i];
|
||||
sum += expf(logit_i - max_logit);
|
||||
}
|
||||
sum = warp_reduce_sum(sum);
|
||||
sum = logf(sum);
|
||||
|
||||
// log(exp(logits - max) / sum) = (logits - max) - log(sum)
|
||||
float loss = 0.0f;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float logit_i = use_shared ? tmp[i] : logits[i];
|
||||
loss += (logit_i - max_logit - sum) * labels[i];
|
||||
}
|
||||
loss = -warp_reduce_sum(loss) / (float)k;
|
||||
|
||||
if (threadIdx.x != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[blockIdx.x] = loss;
|
||||
}
|
||||
|
||||
template <bool use_shared>
|
||||
static __global__ void cross_entropy_loss_back_f32(
|
||||
const float * __restrict__ grad, const float * __restrict__ logits, const float * __restrict__ labels,
|
||||
float * __restrict__ dst, const int nclasses) {
|
||||
extern __shared__ float tmp[];
|
||||
|
||||
logits += int64_t(blockIdx.x)*nclasses;
|
||||
labels += int64_t(blockIdx.x)*nclasses;
|
||||
dst += int64_t(blockIdx.x)*nclasses;
|
||||
|
||||
float maxval = -INFINITY;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float val = logits[i];
|
||||
maxval = fmaxf(maxval, val);
|
||||
|
||||
if (use_shared) {
|
||||
tmp[i] = val;
|
||||
}
|
||||
}
|
||||
maxval = warp_reduce_max(maxval);
|
||||
|
||||
float sum = 0.0f;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float val = expf((use_shared ? tmp[i] : logits[i]) - maxval);
|
||||
sum += val;
|
||||
|
||||
if (use_shared) {
|
||||
tmp[i] = val;
|
||||
} else {
|
||||
dst[i] = val;
|
||||
}
|
||||
}
|
||||
sum = warp_reduce_sum(sum);
|
||||
const float sm_scale = 1.0f/sum;
|
||||
|
||||
const float d_by_nrows = *grad/gridDim.x;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float val = use_shared ? tmp[i] : dst[i];
|
||||
dst[i] = (val*sm_scale - labels[i])*d_by_nrows;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const dim3 blocks_dim(WARP_SIZE, 1, 1);
|
||||
const dim3 blocks_num(nrows, 1, 1);
|
||||
const size_t nbytes_shared = ne00*sizeof(float);
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
|
||||
|
||||
if (nbytes_shared <= smpbo) {
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_f32<true>), smpbo);
|
||||
cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
} else {
|
||||
cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
// Combine results from individual blocks:
|
||||
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * grad = dst->src[0];
|
||||
const ggml_tensor * src0f = dst->src[1];
|
||||
const ggml_tensor * src1f = dst->src[2];
|
||||
|
||||
GGML_ASSERT(src0f->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1f->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( grad->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_scalar(grad));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0f));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1f));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0f, src1f));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0f, dst));
|
||||
|
||||
const int64_t ne00 = src0f->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0f);
|
||||
|
||||
const float * grad_d = (const float *) grad->data;
|
||||
const float * src0f_d = (const float *) src0f->data;
|
||||
const float * src1f_d = (const float *) src1f->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const dim3 blocks_dim(WARP_SIZE, 1, 1);
|
||||
const dim3 blocks_num(nrows, 1, 1);
|
||||
const size_t nbytes_shared = ne00*sizeof(float);
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
if (nbytes_shared <= smpbo) {
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_back_f32<true>), smpbo);
|
||||
cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
|
||||
} else {
|
||||
cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,307 @@
|
||||
#include <algorithm>
|
||||
#include "cumsum.cuh"
|
||||
#include "convert.cuh"
|
||||
#include "ggml-cuda/common.cuh"
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
# include <cub/cub.cuh>
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
template<typename T, int BLOCK_SIZE>
|
||||
static __global__ void cumsum_cub_kernel(
|
||||
const T * __restrict__ src,
|
||||
T * __restrict__ dst,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t s1, const int64_t s2, const int64_t s3) {
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
using BlockScanT = cub::BlockScan<T, BLOCK_SIZE>;
|
||||
|
||||
__shared__ typename BlockScanT::TempStorage temp_storage;
|
||||
__shared__ T block_carry;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
constexpr int UNROLL_FACTOR = 4;
|
||||
constexpr int TILE_SIZE = BLOCK_SIZE * UNROLL_FACTOR;
|
||||
|
||||
const int64_t i1 = blockIdx.x;
|
||||
const int64_t i2 = blockIdx.y;
|
||||
const int64_t i3 = blockIdx.z;
|
||||
|
||||
if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
|
||||
return;
|
||||
}
|
||||
|
||||
const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
|
||||
T * dst_row = dst + i1 * s1 + i2 * s2 + i3 * s3;
|
||||
|
||||
if (tid == 0) {
|
||||
block_carry = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (int64_t start = 0; start < ne00; start += TILE_SIZE) {
|
||||
T items[UNROLL_FACTOR];
|
||||
T thread_sum = T(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL_FACTOR; i++) {
|
||||
int64_t idx = start + tid * UNROLL_FACTOR + i;
|
||||
T val = (idx < ne00) ? src_row[idx] : T(0);
|
||||
thread_sum += val;
|
||||
items[i] = thread_sum;
|
||||
}
|
||||
|
||||
// Block-wide scan on thread sums
|
||||
T thread_prefix;
|
||||
T block_total;
|
||||
BlockScanT(temp_storage).InclusiveSum(thread_sum, thread_prefix, block_total);
|
||||
__syncthreads();
|
||||
|
||||
// Add offset to each item and store
|
||||
T thread_offset = thread_prefix - thread_sum + block_carry;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL_FACTOR; i++) {
|
||||
int64_t idx = start + tid * UNROLL_FACTOR + i;
|
||||
if (idx < ne00) {
|
||||
dst_row[idx] = items[i] + thread_offset;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Update carry for next tile
|
||||
if (tid == 0) {
|
||||
block_carry += block_total;
|
||||
}
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
}
|
||||
|
||||
// Fallback kernel implementation
|
||||
template<typename T>
|
||||
static __global__ void cumsum_kernel(
|
||||
const T * src, T * dst,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t s00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t s0, const int64_t s1, const int64_t s2, const int64_t s3) {
|
||||
|
||||
GGML_UNUSED_VARS(s00, s0);
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
const int lane = tid % warp_size;
|
||||
const int warp = tid / warp_size;
|
||||
const int warps_per_block = blockDim.x / warp_size;
|
||||
|
||||
extern __shared__ float smem[];
|
||||
float * s_vals = smem;
|
||||
float * s_warp_sums = smem + blockDim.x;
|
||||
float * s_carry = smem + blockDim.x + warps_per_block;
|
||||
float * s_chunk_total = s_carry + 1;
|
||||
|
||||
// Initialize carry
|
||||
if (tid == 0) {
|
||||
*s_carry = 0.0f;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const int64_t i3 = blockIdx.z;
|
||||
const int64_t i2 = blockIdx.y;
|
||||
const int64_t i1 = blockIdx.x;
|
||||
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
|
||||
T * dst_row = dst + i1 * s1 + i2 * s2 + i3 * s3;
|
||||
|
||||
// register blocking: process 4 elements per thread to hide latency
|
||||
// and reduce synchronization overhead
|
||||
constexpr int num_unroll = 4;
|
||||
T temp[num_unroll];
|
||||
|
||||
for (int64_t i = 0; i < ne00; i += num_unroll * blockDim.x) {
|
||||
int64_t idx = i + tid * num_unroll;
|
||||
|
||||
// thread local sequential scan
|
||||
temp[0] = (idx < ne00 ? src_row[idx] : T(0));
|
||||
#pragma unroll
|
||||
for (int64_t j = 1; j < num_unroll; j++) {
|
||||
temp[j] = temp[j - 1];
|
||||
if (idx + j < ne00) {
|
||||
temp[j] += src_row[idx + j];
|
||||
} else {
|
||||
temp[j] += 0;
|
||||
}
|
||||
}
|
||||
|
||||
// last emenent is sum of all values assigned to thread
|
||||
float val = (idx < ne00) ? ggml_cuda_cast<float, T>(temp[num_unroll - 1]) : 0.0f;
|
||||
|
||||
// Warp inclusive scan
|
||||
val = warp_prefix_inclusive_sum<T, warp_size>(val);
|
||||
s_vals[tid] = val;
|
||||
|
||||
if (lane == warp_size - 1) {
|
||||
s_warp_sums[warp] = val;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Exclusive scan of warp sums (warp 0 only)
|
||||
if (warp == 0) {
|
||||
float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
|
||||
float inc = warp_prefix_inclusive_sum<T, warp_size>(w);
|
||||
if (tid < warps_per_block) {
|
||||
s_warp_sums[tid] = inc - w; // exclusive sum
|
||||
}
|
||||
if (tid == warps_per_block - 1) {
|
||||
*s_chunk_total = inc; // total sum of this chunk
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// write back results
|
||||
float carry = *s_carry;
|
||||
// calculate sum offset for this thread
|
||||
float final_val_offset = s_vals[tid] + s_warp_sums[warp] + carry - temp[num_unroll - 1];
|
||||
|
||||
#pragma unroll
|
||||
for (int32_t j = 0; j < num_unroll; j++) {
|
||||
if (idx + j < ne00) {
|
||||
dst_row[idx + j] = temp[j] + ggml_cuda_cast<T, float>(final_val_offset);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Update carry for next chunk
|
||||
if (tid == 0) {
|
||||
*s_carry += *s_chunk_total;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
template <typename T>
|
||||
static void cumsum_cub(ggml_cuda_pool & pool,
|
||||
const T * src,
|
||||
T * dst,
|
||||
int64_t ne,
|
||||
cudaStream_t stream) {
|
||||
size_t tmp_size = 0;
|
||||
|
||||
// Query how much temp storage CUDA UnBound (CUB) needs
|
||||
cub::DeviceScan::InclusiveSum(nullptr, // d_temp_storage (null = just query size)
|
||||
tmp_size, // reference to size (will be set by CUB)
|
||||
src, // input pointer
|
||||
dst, // output pointer
|
||||
ne, // number of elements
|
||||
stream // CUDA stream to use
|
||||
);
|
||||
|
||||
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
|
||||
|
||||
// Perform the inclusive scan
|
||||
cub::DeviceScan::InclusiveSum((void *) tmp_alloc.get(), tmp_size, src, dst, ne, stream);
|
||||
}
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
template<typename T>
|
||||
static void cumsum_cuda(
|
||||
[[maybe_unused]] ggml_backend_cuda_context & ctx, const T * src, T * dst,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
|
||||
const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3,
|
||||
cudaStream_t stream) {
|
||||
|
||||
const size_t type_size = sizeof(T);
|
||||
bool use_cub = false;
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
// Check if we can use CUB (data must be contiguous along innermost dimension)
|
||||
const bool is_contiguous = (nb00 == type_size) && (nb0 == type_size);
|
||||
|
||||
if (is_contiguous) {
|
||||
use_cub = true;
|
||||
const int64_t nrows = ne01 * ne02 * ne03;
|
||||
// TODO: Compare with DeviceSegmentedScan::InclusiveSegmentedSum for nrows > 1 once InclusiveSegmentedSum is released
|
||||
// Heuristics were determined as part of https://github.com/ggml-org/llama.cpp/pull/17004
|
||||
if (((nrows == 1) && (ne00 > 1024)) || (ne00 / nrows > 4096)) {
|
||||
for (int i=0; i<nrows; i++) {
|
||||
cumsum_cub(ctx.pool(), src + i * ne00, dst + i * ne00, ne00, stream);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
dim3 grid_dims(ne01, ne02, ne03);
|
||||
const auto &info = ggml_cuda_info().devices[ggml_cuda_get_device()];
|
||||
const int warp_size = info.warp_size;
|
||||
const int num_warps = (ne00 + warp_size - 1) / warp_size;
|
||||
int block_size = num_warps * warp_size;
|
||||
block_size = std::min(block_size, CUDA_CUMSUM_BLOCK_SIZE);
|
||||
dim3 block_dims(block_size, 1, 1);
|
||||
const int warps_per_block = block_size / warp_size;
|
||||
const size_t shmem_size = (block_size + warps_per_block + 2) * sizeof(float);
|
||||
|
||||
if (use_cub && ne00 >= 1024) {
|
||||
cumsum_cub_kernel<T, CUDA_CUMSUM_BLOCK_SIZE><<<grid_dims, CUDA_CUMSUM_BLOCK_SIZE, 0, stream>>>(
|
||||
src, dst,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01 / type_size, nb02 / type_size, nb03 / type_size,
|
||||
nb1 / type_size, nb2 / type_size, nb3 / type_size
|
||||
);
|
||||
} else {
|
||||
cumsum_kernel<<<grid_dims, block_dims, shmem_size, stream>>>(
|
||||
src, dst,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
|
||||
nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
switch(src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
cumsum_cuda(
|
||||
ctx, (const float *)src0->data, (float *)dst->data,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
|
||||
stream
|
||||
);
|
||||
} break;
|
||||
// We do not support those on CPU for now anyway, so comment them out because they cause errors on some CI platforms
|
||||
/*case GGML_TYPE_F16:
|
||||
{
|
||||
cumsum_cuda(
|
||||
(const half *)src0->data, (half *)dst->data,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
|
||||
stream
|
||||
);
|
||||
} break;
|
||||
case GGML_TYPE_BF16:
|
||||
{
|
||||
cumsum_cuda(
|
||||
(const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
|
||||
stream
|
||||
);
|
||||
} break;*/
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CUMSUM_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,77 @@
|
||||
#include "common.cuh"
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_q4_0 * x = (const block_q4_0 *) vx;
|
||||
|
||||
const float d = x[ib].d;
|
||||
|
||||
const int vui = x[ib].qs[iqs];
|
||||
|
||||
v.x = vui & 0xF;
|
||||
v.y = vui >> 4;
|
||||
|
||||
v.x = (v.x - 8.0f) * d;
|
||||
v.y = (v.y - 8.0f) * d;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_q4_1 * x = (const block_q4_1 *) vx;
|
||||
|
||||
const float2 dm = __half22float2(x[ib].dm);
|
||||
|
||||
const int vui = x[ib].qs[iqs];
|
||||
|
||||
v.x = vui & 0xF;
|
||||
v.y = vui >> 4;
|
||||
|
||||
v.x = (v.x * dm.x) + dm.y;
|
||||
v.y = (v.y * dm.x) + dm.y;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_q5_0 * x = (const block_q5_0 *) vx;
|
||||
|
||||
const float d = x[ib].d;
|
||||
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
||||
const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
||||
|
||||
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
||||
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
|
||||
|
||||
v.x = (v.x - 16.0f) * d;
|
||||
v.y = (v.y - 16.0f) * d;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_q5_1 * x = (const block_q5_1 *) vx;
|
||||
|
||||
const float2 dm = __half22float2(x[ib].dm);
|
||||
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
||||
const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
||||
|
||||
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
||||
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
|
||||
|
||||
v.x = (v.x * dm.x) + dm.y;
|
||||
v.y = (v.y * dm.x) + dm.y;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_q8_0 * x = (const block_q8_0 *) vx;
|
||||
|
||||
const float d = x[ib].d;
|
||||
|
||||
v.x = x[ib].qs[iqs + 0];
|
||||
v.y = x[ib].qs[iqs + 1];
|
||||
|
||||
v.x *= d;
|
||||
v.y *= d;
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user