IntelPython · vchamarthi · Apr 14, 2026 · Apr 14, 2026
@@ -9,3 +9,7 @@ mkl_fft/_pydfti.c
 mkl_fft/_pydfti.cpython*.so
 mkl_fft/_pydfti.*-win_amd64.pyd
 mkl_fft/src/mklfft.c
+
+# ASV benchmark artifacts
+.asv/
+benchmarks/.asv/
@@ -0,0 +1,85 @@
+# mkl_fft ASV Benchmarks
+
+Performance benchmarks for [mkl_fft](https://github.com/IntelPython/mkl_fft) using
+[Airspeed Velocity (ASV)](https://asv.readthedocs.io/en/stable/).
+
+## Structure
+
+```
+benchmarks/
+├── asv.conf.json          # ASV configuration (CI-only, no env/build settings)
+└── benchmarks/
+    ├── __init__.py        # Thread pinning (MKL_NUM_THREADS)
+    ├── _utils.py          # Shared helpers (benchmark input generation)
+    ├── bench_fft1d.py     # mkl_fft root API — 1-D transforms
+    ├── bench_fftnd.py     # mkl_fft root API — 2-D and N-D transforms
+    ├── bench_numpy_fft.py # mkl_fft.interfaces.numpy_fft — full coverage
+    ├── bench_scipy_fft.py # mkl_fft.interfaces.scipy_fft — full coverage
+    └── bench_memory.py    # Peak RSS memory benchmarks
+```
+
+### Coverage
+
+| File | API | Transforms |
+|------|-----|-----------|
+| `bench_fft1d.py` | `mkl_fft` | `fft`, `ifft`, `rfft`, `irfft` — power-of-two and non-power-of-two |
+| `bench_fftnd.py` | `mkl_fft` | `fft2`, `ifft2`, `rfft2`, `irfft2`, `fftn`, `ifftn`, `rfftn`, `irfftn` |
+| `bench_numpy_fft.py` | `mkl_fft.interfaces.numpy_fft` | All exported functions including Hermitian (`hfft`, `ihfft`) |
+| `bench_scipy_fft.py` | `mkl_fft.interfaces.scipy_fft` | All exported functions including Hermitian 2-D/N-D (`hfft2`, `hfftn`) |
+| `bench_memory.py` | `mkl_fft` | Peak RSS for 1-D, 2-D, and 3-D transforms |
+
+Benchmarks cover float32, float64, complex64, complex128 dtypes, power-of-two
+and non-power-of-two sizes, square and non-square/non-cubic shapes.
+
+## Threading
+
+`__init__.py` pins `MKL_NUM_THREADS` to **4** when the machine has 4 or more
+physical cores, or falls back to **1** (single-threaded) otherwise. This keeps
+results comparable across CI machines in the shared pool regardless of their
+total core count. Physical cores are read from `/proc/cpuinfo` — hyperthreads
+are excluded per MKL recommendation. If `/proc/cpuinfo` cannot be read or
+parsed (for example on non-Linux hosts), the core count is treated as 1, so
+benchmarks run single-threaded.
+
+Override by setting `MKL_NUM_THREADS` in the environment before running ASV.
+
+## Running Locally
+
+> Benchmarks are designed for CI. Local runs require `mkl_fft` to be installed
+> in the active Python environment. Benchmarks that exercise the SciPy interface
+> (`bench_scipy_fft.py`) also require SciPy:
+>
+> ```bash
+> python -m pip install -e ..
+> python -m pip install scipy
+> ```
+
+```bash
+cd benchmarks/
+
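+# Quick smoke-run against the current working tree (no env management).
+# No commit range is passed: with --python=same ASV benchmarks whatever is
+# installed in the active environment and cannot build other commits.
+asv run --python=same --quick --show-stderr
+
+# Run a specific benchmark file
+asv run --python=same --quick --bench bench_fft1d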
+
+# View and publish results
+asv publish          # generates .asv/html/
+asv preview          # serves at http://localhost:8080
+```
+
+## CI
+
+Benchmarks run automatically in Jenkins on the `auto-bench` node via
+`benchmarkHelper.performanceTest()` from the shared library. The pipeline uses:
+
+```bash
+asv run --environment existing:<python> --set-commit-hash $COMMIT_SHA
+```
+
+This bypasses ASV environment management entirely — mkl_fft is pre-installed
+into a conda environment by the pipeline before ASV is invoked.
+
+- **Nightly (prod):** results are published to the benchmark dashboard
+- **PR (dev):** `asv compare` output is evaluated for regressions; a 30% slowdown
+  triggers a failed GitHub commit status
+
+Results are stored in the `mkl_fft-results` branch of
+`intel-innersource/libraries.python.intel.infrastructure.benchmark-dashboards`.
@@ -0,0 +1,19 @@
+{
+    "version": 1,
+    "project": "mkl_fft",
+    "project_url": "https://github.com/IntelPython/mkl_fft",
+    "show_commit_url": "https://github.com/IntelPython/mkl_fft/commit/",
+    "repo": "..",
+    "branches": [
+        "master"
+    ],
+    "benchmark_dir": "benchmarks",
+    "env_dir": ".asv/env",
+    "results_dir": ".asv/results",
+    "html_dir": ".asv/html",
+    "build_cache_size": 2,
+    "default_benchmark_timeout": 500,
+    "regressions_thresholds": {
+        ".*": 0.3
+    }
+}
@@ -0,0 +1,50 @@
+"""ASV benchmarks for mkl_fft.
+
+Thread control — design rationale
+----------------------------------
+Since we do not have a dedicated CI benchmark machine, benchmarks run on a shared CI pool
+whose machines vary in core count over time.
+Using the full physical core count of each machine would make results
+incomparable across runs on different machines.
+
+Strategy:
+  - Physical cores >= 4  →  fix MKL_NUM_THREADS = 4
+      4 is the lowest common denominator that guarantees multi-threaded MKL
+      behavior and is achievable on any modern CI machine.  Results from
+      different machines in the pool are therefore directly comparable.
+  - Physical cores < 4   →  fall back to MKL_NUM_THREADS = 1 (single-threaded)
+      Prevents over-subscription on under-resourced machines and avoids
+      misleading comparisons against 4-thread baselines.
+
+MKL recommendation: use physical cores, not logical (hyperthreaded) CPUs.
+"""
+
+import os
+import re
+
+# Physical-core threshold for multi-threaded mode.  The same value doubles as
+# the pinned thread count once the threshold is met (see _thread_count).
+_MIN_THREADS = 4
+
+
+def _physical_cores():
+    """Return the physical core count parsed from /proc/cpuinfo.
+
+    The count is the first ``cpu cores`` value multiplied by the number of
+    distinct ``physical id`` values (sockets); when no ``physical id`` entry
+    is present a single socket is assumed.
+
+    Returns
+    -------
+    int
+        Detected physical core count, or 1 (conservative) when
+        /proc/cpuinfo cannot be opened or has no ``cpu cores`` entry.
+    """
+    try:
+        # errors="replace" keeps an unexpected byte from raising during
+        # decoding; only the digit fields matched below are ever used.
+        with open("/proc/cpuinfo", encoding="utf-8", errors="replace") as f:
+            content = f.read()
+    except OSError:
+        # File missing or unreadable: fall back to 1.
+        return 1
+
+    per_socket = re.search(r"cpu cores\s*:\s*(\d+)", content)
+    if per_socket is None:
+        # No per-socket core count is reported: fall back to 1.
+        return 1
+
+    sockets = len(set(re.findall(r"physical id\s*:\s*(\d+)", content)))
+    return int(per_socket.group(1)) * max(sockets, 1)
+
+
+def _thread_count():
+    """Choose the default MKL thread count for this host, as a string.
+
+    Returns ``str(_MIN_THREADS)`` when at least ``_MIN_THREADS`` physical
+    cores are detected, and ``"1"`` otherwise.
+    """
+    if _physical_cores() < _MIN_THREADS:
+        return "1"
+    return str(_MIN_THREADS)
+
+
+# An MKL_NUM_THREADS value already present in the environment wins; otherwise
+# the detected default is installed.  The default is computed in either case.
+_THREADS = os.environ.setdefault("MKL_NUM_THREADS", _thread_count())
+# Give OMP_NUM_THREADS and OPENBLAS_NUM_THREADS the same value unless the
+# caller has already set them explicitly.
+os.environ.setdefault("OMP_NUM_THREADS", _THREADS)
+os.environ.setdefault("OPENBLAS_NUM_THREADS", _THREADS)
@@ -0,0 +1,16 @@
+"""Shared utilities for mkl_fft benchmarks."""
+
+import numpy as np
+
+
+def _make_input(rng, shape, dtype):
+    """Build a standard-normal test array of the requested shape and dtype.
+
+    Parameters
+    ----------
+    rng
+        Object providing ``standard_normal(shape)``; the benchmarks pass a
+        ``numpy.random.Generator``.
+    shape
+        Either an int (treated as a 1-D length) or a shape tuple.
+    dtype
+        Anything accepted by ``numpy.dtype``.
+
+    Returns
+    -------
+    numpy.ndarray
+        Array cast to *dtype*.  For complex dtypes the real part is drawn
+        first and the imaginary part second, so both parts carry signal.
+    """
+    target = np.dtype(dtype)
+    dims = shape if not isinstance(shape, int) else (shape,)
+    values = rng.standard_normal(dims)
+    if target.kind == "c":
+        # Second draw supplies the imaginary component.
+        values = values + 1j * rng.standard_normal(dims)
+    return values.astype(target)
@@ -0,0 +1,124 @@
+"""Benchmarks for 1-D FFT operations using the mkl_fft root API."""
+
+import numpy as np
+
+import mkl_fft
+
+from ._utils import _make_input
+
+# Seed passed to np.random.default_rng in every setup() below, so each run
+# times the same input data.
+_RNG_SEED = 42
+
+
+# ---------------------------------------------------------------------------
+# Complex-to-complex 1-D (power-of-two sizes)
+# ---------------------------------------------------------------------------
+
+
+class TimeFFT1D:
+    """Time ``mkl_fft.fft`` and ``mkl_fft.ifft`` on power-of-two lengths.
+
+    Parameterized over transform length ``n`` and input ``dtype``; both real
+    and complex input dtypes are passed to the complex transforms.
+    """
+
+    param_names = ["n", "dtype"]
+    params = [
+        # 64 ... 65536 in steps of 4x
+        [2**k for k in (6, 8, 10, 12, 14, 16)],
+        ["float32", "float64", "complex64", "complex128"],
+    ]
+
+    def setup(self, n, dtype):
+        """Create the input array once per parameter combination."""
+        self.x = _make_input(np.random.default_rng(_RNG_SEED), n, dtype)
+
+    def time_fft(self, n, dtype):
+        """Forward transform of the prepared input."""
+        mkl_fft.fft(self.x)
+
+    def time_ifft(self, n, dtype):
+        """Inverse transform of the prepared input."""
+        mkl_fft.ifft(self.x)
+
+
+# ---------------------------------------------------------------------------
+# Real-to-complex / complex-to-real 1-D (power-of-two sizes)
+# ---------------------------------------------------------------------------
+
+
+class TimeRFFT1D:
+    """Time ``mkl_fft.rfft`` and ``mkl_fft.irfft`` on power-of-two lengths.
+
+    Parameterized over signal length ``n`` and real input ``dtype``.
+    """
+
+    params = [
+        [64, 256, 1024, 4096, 16384, 65536],
+        ["float32", "float64"],
+    ]
+    param_names = ["n", "dtype"]
+
+    def setup(self, n, dtype):
+        """Prepare a real signal for rfft and a half-spectrum for irfft."""
+        rng = np.random.default_rng(_RNG_SEED)
+        # Complex dtype of matching precision for the irfft input.
+        cdtype = "complex64" if dtype == "float32" else "complex128"
+        # Use the shared helper instead of repeating the generation inline;
+        # it makes the same draws in the same order.
+        self.x_real = _make_input(rng, n, dtype)
+        # irfft input: complex half-spectrum of length n//2+1
+        self.x_complex = _make_input(rng, n // 2 + 1, cdtype)
+
+    def time_rfft(self, n, dtype):
+        """Real-to-complex forward transform."""
+        mkl_fft.rfft(self.x_real)
+
+    def time_irfft(self, n, dtype):
+        """Complex-to-real inverse transform back to length ``n``."""
+        mkl_fft.irfft(self.x_complex, n=n)
+
+
+# ---------------------------------------------------------------------------
+# Complex-to-complex 1-D (non-power-of-two sizes)
+# ---------------------------------------------------------------------------
+
+
+class TimeFFT1DNonPow2:
+    """Time ``mkl_fft.fft`` and ``mkl_fft.ifft`` on non-power-of-two lengths.
+
+    MKL uses a different code path for non-power-of-two transforms;
+    this suite exists to catch regressions in that path.
+    """
+
+    param_names = ["n", "dtype"]
+    params = [
+        [127, 509, 1000, 4001, 10007],
+        ["float64", "complex128", "complex64"],
+    ]
+
+    def setup(self, n, dtype):
+        """Create the input array once per parameter combination."""
+        self.x = _make_input(np.random.default_rng(_RNG_SEED), n, dtype)
+
+    def time_fft(self, n, dtype):
+        """Forward transform of the prepared input."""
+        mkl_fft.fft(self.x)
+
+    def time_ifft(self, n, dtype):
+        """Inverse transform of the prepared input."""
+        mkl_fft.ifft(self.x)
+
+
+# ---------------------------------------------------------------------------
+# Real-to-complex / complex-to-real 1-D (non-power-of-two sizes)
+# ---------------------------------------------------------------------------
+
+
+class TimeRFFT1DNonPow2:
+    """Time ``mkl_fft.rfft`` and ``mkl_fft.irfft`` on non-power-of-two lengths.
+
+    Parameterized over signal length ``n`` and real input ``dtype``.
+    """
+
+    params = [
+        [127, 509, 1000, 4001, 10007],
+        ["float32", "float64"],
+    ]
+    param_names = ["n", "dtype"]
+
+    def setup(self, n, dtype):
+        """Prepare a real signal for rfft and a half-spectrum for irfft."""
+        rng = np.random.default_rng(_RNG_SEED)
+        # Complex dtype of matching precision for the irfft input.
+        cdtype = "complex64" if dtype == "float32" else "complex128"
+        # Use the shared helper instead of repeating the generation inline;
+        # it makes the same draws in the same order.
+        self.x_real = _make_input(rng, n, dtype)
+        # irfft input: complex half-spectrum of length n//2+1
+        self.x_complex = _make_input(rng, n // 2 + 1, cdtype)
+
+    def time_rfft(self, n, dtype):
+        """Real-to-complex forward transform."""
+        mkl_fft.rfft(self.x_real)
+
+    def time_irfft(self, n, dtype):
+        """Complex-to-real inverse transform back to length ``n``."""
+        mkl_fft.irfft(self.x_complex, n=n)