Skip to content

Commit

Permalink
ENH: Adopt new macOS Accelerate BLAS/LAPACK Interfaces, including ILP64 (numpy#24053)
Browse files Browse the repository at this point in the history

macOS 13.3 shipped with an updated Accelerate framework that provides BLAS / LAPACK.
The new version is aligned with Netlib's v3.9.1 and also supports ILP64.  The changes here
adopt those new interfaces when available.

- New interfaces are used when ACCELERATE_NEW_LAPACK is defined.
- ILP64 interfaces are used when both ACCELERATE_NEW_LAPACK and ACCELERATE_LAPACK_ILP64 are defined.

macOS 13.3 now ships with 3 different sets of BLAS / LAPACK interfaces:
- LP64 / LAPACK v3.2.1 - legacy interfaces kept for compatibility
- LP64 / LAPACK v3.9.1 - new interfaces
- ILP64 / LAPACK v3.9.1 - new interfaces with ILP64 support

For LP64, we want to support building against the macOS 13.3+ SDK, but have it work on pre-13.3 systems.
To that end, we created wrappers for each API that do a runtime check on which set of API is available
and should be used. However, these were deemed potentially too complex to include during review
of numpygh-24053, and left out in this commit. Please see numpygh-24053 for those.

ILP64 is only supported on macOS 13.3+ and does not use additional wrappers.

We've included support for both distutils and Meson builds. All tests pass on Apple silicon
and Intel based Macs. A new CI job for Accelerate ILP64 on x86-64 was added as well.

Benchmarks
ILP64 Accelerate vs OpenBLAS
       before           after         ratio
     [73f0cf4f]       [d1572653]
     <openblas-ilp64>       <accelerate-ilp64>
              n/a              n/a      n/a  bench_linalg.Linalg.time_op('det', 'float16')
              n/a              n/a      n/a  bench_linalg.Linalg.time_op('pinv', 'float16')
              n/a              n/a      n/a  bench_linalg.Linalg.time_op('svd', 'float16')
           failed           failed      n/a  bench_linalg.LinalgSmallArrays.time_det_small_array
+      3.96±0.1μs       5.04±0.4μs     1.27  bench_linalg.Linalg.time_op('norm', 'float32')
      1.43±0.04ms         1.43±0ms     1.00  bench_linalg.Einsum.time_einsum_outer(<class 'numpy.float32'>)
       12.7±0.4μs       12.7±0.3μs     1.00  bench_linalg.Einsum.time_einsum_sum_mul2(<class 'numpy.float32'>)
       24.1±0.8μs      24.1±0.04μs     1.00  bench_linalg.Linalg.time_op('norm', 'float16')
       9.48±0.2ms       9.48±0.3ms     1.00  bench_linalg.Einsum.time_einsum_outer(<class 'numpy.float64'>)
         609±20μs          609±2μs     1.00  bench_linalg.Einsum.time_einsum_noncon_outer(<class 'numpy.float32'>)
         64.9±2μs      64.7±0.07μs     1.00  bench_linalg.Einsum.time_einsum_contig_outstride0(<class 'numpy.float64'>)
      1.24±0.03ms      1.24±0.01ms     1.00  bench_linalg.Einsum.time_einsum_noncon_outer(<class 'numpy.float64'>)
          102±3μs        102±0.2μs     1.00  bench_linalg.Einsum.time_einsum_contig_contig(<class 'numpy.float64'>)
       21.9±0.8μs      21.8±0.02μs     1.00  bench_linalg.Einsum.time_einsum_multiply(<class 'numpy.float64'>)
       22.8±0.2ms       22.7±0.3ms     0.99  bench_linalg.Eindot.time_einsum_ijk_jil_kl
       13.3±0.4μs      13.3±0.02μs     0.99  bench_linalg.Einsum.time_einsum_sum_mul2(<class 'numpy.float64'>)
       9.56±0.3μs       9.49±0.2μs     0.99  bench_linalg.Einsum.time_einsum_noncon_contig_contig(<class 'numpy.float64'>)
       7.31±0.2μs      7.26±0.08μs     0.99  bench_linalg.Einsum.time_einsum_noncon_contig_outstride0(<class 'numpy.float32'>)
       5.60±0.2ms      5.55±0.02ms     0.99  bench_linalg.Eindot.time_einsum_ij_jk_a_b
         37.1±1μs       36.7±0.1μs     0.99  bench_linalg.Einsum.time_einsum_contig_outstride0(<class 'numpy.float32'>)
       13.5±0.4μs      13.4±0.05μs     0.99  bench_linalg.Einsum.time_einsum_sum_mul(<class 'numpy.float64'>)
      1.03±0.03μs         1.02±0μs     0.99  bench_linalg.LinalgSmallArrays.time_norm_small_array
         51.6±2μs      51.0±0.09μs     0.99  bench_linalg.Einsum.time_einsum_contig_contig(<class 'numpy.float32'>)
       15.2±0.5μs      15.0±0.04μs     0.99  bench_linalg.Einsum.time_einsum_noncon_sum_mul2(<class 'numpy.float64'>)
       13.9±0.4μs      13.7±0.02μs     0.99  bench_linalg.Einsum.time_einsum_noncon_sum_mul2(<class 'numpy.float32'>)
         415±10μs        409±0.4μs     0.99  bench_linalg.Eindot.time_einsum_i_ij_j
       9.29±0.3μs      9.01±0.03μs     0.97  bench_linalg.Einsum.time_einsum_noncon_mul(<class 'numpy.float64'>)
       18.2±0.6μs      17.6±0.04μs     0.97  bench_linalg.Einsum.time_einsum_multiply(<class 'numpy.float32'>)
         509±40μs         492±10μs     0.97  bench_linalg.Einsum.time_einsum_mul(<class 'numpy.float64'>)
       9.63±0.3μs      9.28±0.09μs     0.96  bench_linalg.Einsum.time_einsum_noncon_contig_contig(<class 'numpy.float32'>)
       9.08±0.2μs      8.73±0.02μs     0.96  bench_linalg.Einsum.time_einsum_noncon_mul(<class 'numpy.float32'>)
       15.6±0.5μs      15.0±0.04μs     0.96  bench_linalg.Einsum.time_einsum_noncon_sum_mul(<class 'numpy.float64'>)
       7.74±0.2μs      7.39±0.04μs     0.95  bench_linalg.Einsum.time_einsum_noncon_contig_outstride0(<class 'numpy.float64'>)
       18.6±0.6μs      17.7±0.03μs     0.95  bench_linalg.Einsum.time_einsum_noncon_multiply(<class 'numpy.float32'>)
       14.5±0.4μs      13.7±0.03μs     0.95  bench_linalg.Einsum.time_einsum_noncon_sum_mul(<class 'numpy.float32'>)
       13.3±0.6μs       12.5±0.3μs     0.94  bench_linalg.Einsum.time_einsum_sum_mul(<class 'numpy.float32'>)
       23.5±0.5μs      21.9±0.05μs     0.93  bench_linalg.Einsum.time_einsum_noncon_multiply(<class 'numpy.float64'>)
         264±20μs          243±4μs     0.92  bench_linalg.Einsum.time_einsum_mul(<class 'numpy.float32'>)
-        177±50μs        132±0.6μs     0.75  bench_linalg.Eindot.time_dot_trans_at_a
-      10.7±0.3μs      7.13±0.01μs     0.67  bench_linalg.Linalg.time_op('norm', 'int16')
-        97.5±2μs       64.7±0.1μs     0.66  bench_linalg.Eindot.time_matmul_trans_a_at
-      8.87±0.3μs         5.76±0μs     0.65  bench_linalg.Linalg.time_op('norm', 'longfloat')
-      8.90±0.3μs      5.77±0.01μs     0.65  bench_linalg.Linalg.time_op('norm', 'float64')
-      8.48±0.3μs      5.40±0.01μs     0.64  bench_linalg.Linalg.time_op('norm', 'int64')
-         106±2μs         66.5±8μs     0.63  bench_linalg.Eindot.time_inner_trans_a_a
-      8.25±0.3μs         5.16±0μs     0.62  bench_linalg.Linalg.time_op('norm', 'int32')
-         103±5ms       64.6±0.5ms     0.62  bench_import.Import.time_linalg
-         106±3μs       66.0±0.1μs     0.62  bench_linalg.Eindot.time_dot_trans_a_at
-        202±20μs        124±0.6μs     0.61  bench_linalg.Eindot.time_matmul_trans_at_a
-       31.5±10μs      19.3±0.02μs     0.61  bench_linalg.Eindot.time_dot_d_dot_b_c
-       32.4±20μs      19.7±0.03μs     0.61  bench_linalg.Eindot.time_matmul_d_matmul_b_c
-        5.05±1ms      3.06±0.09ms     0.61  bench_linalg.Linalg.time_op('svd', 'complex128')
-      5.35±0.9ms      3.09±0.09ms     0.58  bench_linalg.Linalg.time_op('svd', 'complex64')
-        6.37±3ms       3.27±0.1ms     0.51  bench_linalg.Linalg.time_op('pinv', 'complex128')
-        7.26±8ms       3.24±0.1ms     0.45  bench_linalg.Linalg.time_op('pinv', 'complex64')
-       519±100μs        219±0.8μs     0.42  bench_linalg.Linalg.time_op('det', 'complex64')
-      31.3±0.9μs       12.8±0.1μs     0.41  bench_linalg.Linalg.time_op('norm', 'complex128')
-      2.44±0.7ms          924±1μs     0.38  bench_linalg.Linalg.time_op('pinv', 'float64')
-      29.9±0.8μs      10.8±0.01μs     0.36  bench_linalg.Linalg.time_op('norm', 'complex64')
-      2.56±0.5ms          924±1μs     0.36  bench_linalg.Linalg.time_op('pinv', 'float32')
-      2.63±0.5ms        924±0.6μs     0.35  bench_linalg.Linalg.time_op('pinv', 'int64')
-      2.68±0.7ms         927±10μs     0.35  bench_linalg.Linalg.time_op('pinv', 'int32')
-      2.68±0.5ms         927±10μs     0.35  bench_linalg.Linalg.time_op('pinv', 'int16')
-      2.93±0.6ms          925±2μs     0.32  bench_linalg.Linalg.time_op('pinv', 'longfloat')
-       809±500μs        215±0.2μs     0.27  bench_linalg.Linalg.time_op('det', 'complex128')
-      3.67±0.9ms         895±20μs     0.24  bench_linalg.Eindot.time_tensordot_a_b_axes_1_0_0_1
-       489±100μs         114±20μs     0.23  bench_linalg.Eindot.time_inner_trans_a_ac
-      3.64±0.7ms        777±0.3μs     0.21  bench_linalg.Lstsq.time_numpy_linalg_lstsq_a__b_float64
-        755±90μs         157±10μs     0.21  bench_linalg.Eindot.time_dot_a_b
-        4.63±1ms          899±9μs     0.19  bench_linalg.Linalg.time_op('svd', 'longfloat')
-        5.19±1ms         922±10μs     0.18  bench_linalg.Linalg.time_op('svd', 'float64')
-       599±200μs         89.4±2μs     0.15  bench_linalg.Eindot.time_matmul_trans_atc_a
-       956±200μs         140±10μs     0.15  bench_linalg.Eindot.time_matmul_a_b
-        6.45±3ms         903±10μs     0.14  bench_linalg.Linalg.time_op('svd', 'float32')
-        6.42±3ms        896±0.7μs     0.14  bench_linalg.Linalg.time_op('svd', 'int32')
-        6.47±4ms          902±5μs     0.14  bench_linalg.Linalg.time_op('svd', 'int64')
-        6.52±1ms          899±2μs     0.14  bench_linalg.Linalg.time_op('svd', 'int16')
-       799±300μs          109±2μs     0.14  bench_linalg.Eindot.time_dot_trans_atc_a
-       502±100μs       65.0±0.2μs     0.13  bench_linalg.Eindot.time_dot_trans_a_atc
-       542±300μs      64.2±0.05μs     0.12  bench_linalg.Eindot.time_matmul_trans_a_atc
-       458±300μs      41.6±0.09μs     0.09  bench_linalg.Linalg.time_op('det', 'int32')
-       471±100μs      41.9±0.03μs     0.09  bench_linalg.Linalg.time_op('det', 'float32')
-       510±100μs      43.6±0.06μs     0.09  bench_linalg.Linalg.time_op('det', 'int16')
-       478±200μs      39.6±0.05μs     0.08  bench_linalg.Linalg.time_op('det', 'longfloat')
-       599±200μs      39.6±0.09μs     0.07  bench_linalg.Linalg.time_op('det', 'float64')
-       758±300μs       41.6±0.1μs     0.05  bench_linalg.Linalg.time_op('det', 'int64')

Co-authored-by: Ralf Gommers <ralf.gommers@gmail.com>
  • Loading branch information
2 people authored and charris committed Sep 1, 2023
1 parent 4fb4d7a commit fdbed5e
Show file tree
Hide file tree
Showing 9 changed files with 303 additions and 14 deletions.
5 changes: 4 additions & 1 deletion .cirrus.star
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,7 @@ def main(ctx):
if wheel:
return fs.read("tools/ci/cirrus_wheels.yml")

return fs.read("tools/ci/cirrus_macosx_arm64.yml")
if int(pr_number) < 0:
return []

return fs.read("tools/ci/cirrus_arm.yml")
135 changes: 135 additions & 0 deletions .github/workflows/macos.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
name: macOS tests (meson)

on:
pull_request:
branches:
- main
- maintenance/**

permissions:
contents: read # to fetch code (actions/checkout)

env:
CCACHE_DIR: "${{ github.workspace }}/.ccache"

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

jobs:
x86_conda:
name: macOS x86-64 conda
if: "github.repository == 'numpy/numpy'"
runs-on: macos-latest
strategy:
matrix:
python-version: ["3.11"]

steps:
- uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
with:
submodules: recursive
fetch-depth: 0

- name: Prepare cache dirs and timestamps
id: prep-ccache
shell: bash -l {0}
run: |
mkdir -p "${CCACHE_DIR}"
echo "dir=$CCACHE_DIR" >> $GITHUB_OUTPUT
NOW=$(date -u +"%F-%T")
echo "timestamp=${NOW}" >> $GITHUB_OUTPUT
echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
- name: Setup compiler cache
uses: actions/cache@88522ab9f39a2ea568f7027eddc7d8d8bc9d59c8 # v3.3.1
id: cache-ccache
with:
path: ${{ steps.prep-ccache.outputs.dir }}
key: ${{ github.workflow }}-${{ matrix.python-version }}-ccache-macos-${{ steps.prep-ccache.outputs.timestamp }}
restore-keys: |
${{ github.workflow }}-${{ matrix.python-version }}-ccache-macos-
- name: Setup Mambaforge
uses: conda-incubator/setup-miniconda@3b0f2504dd76ef23b6d31f291f4913fb60ab5ff3 # v2.2.0
with:
python-version: ${{ matrix.python-version }}
channels: conda-forge
channel-priority: true
activate-environment: numpy-dev
use-only-tar-bz2: false
miniforge-variant: Mambaforge
miniforge-version: latest
use-mamba: true

# Updates if `environment.yml` or the date changes. The latter is needed to
# ensure we re-solve once a day (since we don't lock versions). Could be
# replaced by a conda-lock based approach in the future.
- name: Cache conda environment
uses: actions/cache@88522ab9f39a2ea568f7027eddc7d8d8bc9d59c8 # v3.3.1
env:
# Increase this value to reset cache if environment.yml has not changed
CACHE_NUMBER: 1
with:
path: ${{ env.CONDA }}/envs/numpy-dev
key:
${{ runner.os }}--${{ steps.prep-ccache.outputs.today }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('environment.yml') }}
id: envcache

- name: Update Conda Environment
run: mamba env update -n numpy-dev -f environment.yml
if: steps.envcache.outputs.cache-hit != 'true'

- name: Build and Install NumPy
shell: bash -l {0}
run: |
conda activate numpy-dev
CC="ccache $CC" spin build -j2
- name: Run test suite (full)
shell: bash -l {0}
run: |
conda activate numpy-dev
export OMP_NUM_THREADS=2
spin test -j2 -m full
- name: Ccache statistics
shell: bash -l {0}
run: |
conda activate numpy-dev
ccache -s
accelerate:
name: Accelerate ILP64
if: "github.repository == 'numpy/numpy'"
runs-on: macos-13
steps:
- uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
with:
submodules: recursive
fetch-depth: 0

- uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
with:
python-version: '3.10'

- uses: maxim-lobanov/setup-xcode@9a697e2b393340c3cacd97468baa318e4c883d98 # v1.5.1
with:
xcode-version: '14.3'

- name: Install dependencies
run: |
pip install -r build_requirements.txt
pip install pytest pytest-xdist hypothesis
- name: Build NumPy against Accelerate (ILP64)
run: |
spin build -- -Dblas=accelerate -Dlapack=accelerate -Duse-ilp64=true
- name: Show meson-log.txt
if: always()
run: 'cat build/meson-logs/meson-log.txt'

- name: Test
run: |
spin test -j2
1 change: 1 addition & 0 deletions build_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ Cython>=3.0
wheel==0.38.1
ninja
spin==0.5
build
5 changes: 5 additions & 0 deletions doc/release/upcoming_changes/24053.new_feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Support for the updated Accelerate BLAS/LAPACK library, including ILP64 (64-bit
integer) support, in macOS 13.3 has been added. This brings arm64 support, and
significant performance improvements of up to 10x for commonly used linear
algebra operations. When Accelerate is selected at build time, the 13.3+
version will automatically be used if available.
15 changes: 15 additions & 0 deletions numpy/core/src/common/npy_cblas.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,21 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};

#define CBLAS_INDEX size_t /* this may vary between platforms */

/*
 * macOS 13.3+ Accelerate ships new BLAS/LAPACK interfaces aligned with
 * Netlib v3.9.1.  When ACCELERATE_NEW_LAPACK is defined, select those new
 * symbols: they carry a $NEWLAPACK (and, for 64-bit integers, $NEWLAPACK$ILP64)
 * suffix and no trailing Fortran underscore (hence NO_APPEND_FORTRAN).
 */
#ifdef ACCELERATE_NEW_LAPACK
/* The new interfaces only exist in the macOS 13.3+ SDK. */
#if __MAC_OS_X_VERSION_MAX_ALLOWED < 130300
#ifdef HAVE_BLAS_ILP64
/* ILP64 has no legacy fallback, so an older SDK is a hard error. */
#error "Accelerate ILP64 support is only available with macOS 13.3 SDK or later"
#endif
#else
#define NO_APPEND_FORTRAN
#ifdef HAVE_BLAS_ILP64
#define BLAS_SYMBOL_SUFFIX $NEWLAPACK$ILP64
#else
#define BLAS_SYMBOL_SUFFIX $NEWLAPACK
#endif
#endif
#endif

#ifdef NO_APPEND_FORTRAN
#define BLAS_FORTRAN_SUFFIX
#else
Expand Down
27 changes: 23 additions & 4 deletions numpy/distutils/system_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
_numpy_info:Numeric
_pkg_config_info:None
accelerate_info:accelerate
accelerate_lapack_info:accelerate
agg2_info:agg2
amd_info:amd
atlas_3_10_blas_info:atlas
Expand Down Expand Up @@ -534,6 +535,7 @@ def get_info(name, notfound_action=0):
'lapack_ssl2': lapack_ssl2_info,
'blas_ssl2': blas_ssl2_info,
'accelerate': accelerate_info, # use blas_opt instead
'accelerate_lapack': accelerate_lapack_info,
'openblas64_': openblas64__info,
'openblas64__lapack': openblas64__lapack_info,
'openblas_ilp64': openblas_ilp64_info,
Expand Down Expand Up @@ -2015,14 +2017,17 @@ def _check_info(self, info):

class lapack_ilp64_opt_info(lapack_opt_info, _ilp64_opt_info_mixin):
notfounderror = LapackILP64NotFoundError
lapack_order = ['openblas64_', 'openblas_ilp64']
lapack_order = ['openblas64_', 'openblas_ilp64', 'accelerate']
order_env_var_name = 'NPY_LAPACK_ILP64_ORDER'

def _calc_info(self, name):
    """Look up ILP64 LAPACK info for backend `name`.

    Queries the ``'<name>_lapack'`` entry of the system-info registry,
    and, if the result passes the ILP64 suitability check, records it.

    Returns True when info was found and set, False otherwise.
    """
    # Leftover debug print() calls removed: library code must not write
    # diagnostics to stdout during configuration.
    info = get_info(name + '_lapack')
    if self._check_info(info):
        self.set_info(**info)
        return True
    # No 'else' needed after the early return above.
    return False


Expand Down Expand Up @@ -2163,7 +2168,7 @@ def calc_info(self):

class blas_ilp64_opt_info(blas_opt_info, _ilp64_opt_info_mixin):
notfounderror = BlasILP64NotFoundError
blas_order = ['openblas64_', 'openblas_ilp64']
blas_order = ['openblas64_', 'openblas_ilp64', 'accelerate']
order_env_var_name = 'NPY_BLAS_ILP64_ORDER'

def _calc_info(self, name):
Expand Down Expand Up @@ -2625,13 +2630,27 @@ def calc_info(self):
link_args.extend(['-Wl,-framework', '-Wl,vecLib'])

if args:
macros = [
('NO_ATLAS_INFO', 3),
('HAVE_CBLAS', None),
('ACCELERATE_NEW_LAPACK', None),
]
if(os.getenv('NPY_USE_BLAS_ILP64', None)):
print('Setting HAVE_BLAS_ILP64')
macros += [
('HAVE_BLAS_ILP64', None),
('ACCELERATE_LAPACK_ILP64', None),
]
self.set_info(extra_compile_args=args,
extra_link_args=link_args,
define_macros=[('NO_ATLAS_INFO', 3),
('HAVE_CBLAS', None)])
define_macros=macros)

return

class accelerate_lapack_info(accelerate_info):
    # Registry alias: the ILP64 lookup path resolves backends by the name
    # '<backend>_lapack', so 'accelerate_lapack' must exist as its own entry.
    def _calc_info(self):
        # Intentionally identical to accelerate_info; this subclass exists
        # only to provide the '_lapack'-suffixed registry name.
        return super()._calc_info()

class blas_src_info(system_info):
# BLAS_SRC is deprecated, please do not use this!
# Build or install a BLAS library via your package manager or from
Expand Down
10 changes: 6 additions & 4 deletions numpy/linalg/meson.build
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# Note that `python_xerbla.c` was excluded on Windows in setup.py;
# unclear why and it seems needed, so unconditionally used here.
lapack_lite_sources = [
'lapack_lite/python_xerbla.c',
]
python_xerbla_sources = ['lapack_lite/python_xerbla.c']

lapack_lite_sources = []
if not have_lapack
lapack_lite_sources += [
lapack_lite_sources = [
'lapack_lite/f2c.c',
'lapack_lite/f2c_c_lapack.c',
'lapack_lite/f2c_d_lapack.c',
Expand All @@ -19,6 +19,7 @@ endif
py.extension_module('lapack_lite',
[
'lapack_litemodule.c',
python_xerbla_sources,
lapack_lite_sources,
],
dependencies: [np_core_dep, blas_dep, lapack_dep],
Expand All @@ -29,6 +30,7 @@ py.extension_module('lapack_lite',
py.extension_module('_umath_linalg',
[
'umath_linalg.cpp',
python_xerbla_sources,
lapack_lite_sources,
],
dependencies: [np_core_dep, blas_dep, lapack_dep],
Expand Down
59 changes: 55 additions & 4 deletions numpy/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ else
]
endif

# Detect whether the active macOS SDK is 13.3 or newer; only those SDKs
# provide Accelerate's new BLAS/LAPACK (Netlib v3.9.1 / ILP64) interfaces.
macOS13_3_or_later = false
if host_machine.system() == 'darwin'
# Ask Xcode for the selected SDK's version string (e.g. '13.3').
# check: true aborts configuration if xcrun itself fails.
r = run_command('xcrun', '-sdk', 'macosx', '--show-sdk-version', check: true)
sdkVersion = r.stdout().strip()

macOS13_3_or_later = sdkVersion.version_compare('>=13.3')
endif

# This is currently injected directly into CFLAGS/CXXFLAGS for wheel builds
# (see cibuildwheel settings in pyproject.toml), but used by CI jobs already
Expand Down Expand Up @@ -81,6 +88,7 @@ endif
# https://github.com/mesonbuild/meson/issues/2835
blas_name = get_option('blas')
lapack_name = get_option('lapack')

# pkg-config uses a lower-case name while CMake uses a capitalized name, so try
# that too to make the fallback detection with CMake work
if blas_name == 'openblas'
Expand All @@ -90,6 +98,23 @@ if blas_name == 'openblas'
_openblas_names = ['openblas', 'OpenBLAS']
endif
blas = dependency(_openblas_names, required: false)
elif blas_name.to_lower() == 'accelerate'
# macOS 13.3+ has updated interfaces aligned with BLAS/LAPACK 3.9.1. Use them if available.
if macOS13_3_or_later
accelerate_compile_args = ['-DACCELERATE_NEW_LAPACK']
if(use_ilp64)
accelerate_compile_args += '-DACCELERATE_LAPACK_ILP64'
endif
blas = declare_dependency(
compile_args: accelerate_compile_args,
dependencies: dependency('Accelerate')
)
else
if(use_ilp64)
error('macOS SDK 13.3+ is required for ILP64 support.')
endif
blas = dependency('Accelerate')
endif
else
blas = dependency(blas_name, required: false)
endif
Expand All @@ -112,14 +137,22 @@ if have_blas
# `dependency('blas', modules: cblas)`
# see https://github.com/mesonbuild/meson/pull/10921.
have_cblas = false
if cc.links('''
if blas_name.to_lower() == 'accelerate'
_cblas_header = '<Accelerate/Accelerate.h>'
elif blas_name.to_lower().startswith('mkl')
_cblas_header = '<mkl_cblas.h>'
else
_cblas_header = '<cblas.h>'
endif
if cc.links(f'''
#ifndef BLAS_SYMBOL_SUFFIX
# define BLAS_SYMBOL_SUFFIX
#endif
#define EXPAND(suffix) cblas_ddot ## suffix
#define DDOT(suffix) EXPAND(suffix)
#include <cblas.h>
#include @_cblas_header@
int main(int argc, const char *argv[])
{
double a[4] = {1,2,3,4};
Expand Down Expand Up @@ -178,9 +211,27 @@ else
endif

if lapack_name == 'openblas'
lapack_name = ['openblas', 'OpenBLAS']
lapack_dep = dependency(['openblas', 'OpenBLAS'], required: false)
elif lapack_name.to_lower() == 'accelerate'
# macOS 13.3+ has updated interfaces aligned with BLAS/LAPACK 3.9.1. Use them if available.
if macOS13_3_or_later
accelerate_compile_args = ['-DACCELERATE_NEW_LAPACK']
if(use_ilp64)
accelerate_compile_args += '-DACCELERATE_LAPACK_ILP64'
endif
lapack_dep = declare_dependency(
compile_args: accelerate_compile_args,
dependencies: dependency('Accelerate')
)
else
if(use_ilp64)
error('macOS SDK 13.3+ is required for ILP64 support.')
endif
lapack_dep = dependency('Accelerate')
endif
else
lapack_dep = dependency(lapack_name, required: false)
endif
lapack_dep = dependency(lapack_name, required: false)
have_lapack = lapack_dep.found()
if not have_lapack and not allow_noblas
error('No LAPACK library detected! Install one, or use the ' + \
Expand Down

0 comments on commit fdbed5e

Please sign in to comment.