diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 2d17eec10..000000000 --- a/.flake8 +++ /dev/null @@ -1,11 +0,0 @@ -[flake8] -max-line-length = 100 -show-source = True -select = C,E,F,W,B,T -ignore = E203, E402, W503 -per-file-ignores = - *__init__.py:F401 - *cli.py:T201 -exclude = - venv - examples diff --git a/.github/workflows/dist.yaml b/.github/workflows/dist.yaml index 63641ae72..b81651cea 100644 --- a/.github/workflows/dist.yaml +++ b/.github/workflows/dist.yaml @@ -1,19 +1,37 @@ name: dist-check -on: [push, pull_request] +on: + workflow_dispatch: + + push: + branches: + - main + - develop + tags: + - "v*.*.*" + + pull_request: + branches: + - main + - develop + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: dist: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.8 - name: Build dist run: | - python setup.py sdist + pip install build + python -m build --sdist - name: Twine check run: | pip install twine diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index e601176b3..e50d67710 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -1,13 +1,30 @@ name: Docs -on: [pull_request, push] +on: + workflow_dispatch: + + push: + branches: + - main + - develop + tags: + - "v*.*.*" + + pull_request: + branches: + - main + - develop + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: build-and-deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.8 - name: Install dependencies diff --git a/.github/workflows/pre-commit.yaml 
b/.github/workflows/pre-commit.yaml index 074ae7add..9d1ab7fa8 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -1,14 +1,31 @@ name: pre-commit -on: [push] +on: + workflow_dispatch: + + push: + branches: + - main + - develop + tags: + - "v*.*.*" + + pull_request: + branches: + - main + - develop + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: run-all-files: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python 3.8 - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.8 - name: Install pre-commit diff --git a/.github/workflows/release_docker.yaml b/.github/workflows/release_docker.yaml index 6ceb1d060..c8f8c59f8 100644 --- a/.github/workflows/release_docker.yaml +++ b/.github/workflows/release_docker.yaml @@ -1,11 +1,20 @@ name: release-docker on: + workflow_dispatch: push: branches: - - 'main' - 'develop' - 'docker' + tags: + - 'v*' + pull_request: + branches: + - 'develop' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: @@ -15,34 +24,46 @@ jobs: steps: - name: Set up QEMU - uses: docker/setup-qemu-action@v2 + uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Check out the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Extract metadata (tags, labels) for Docker Hub id: meta_dockerhub - uses: docker/metadata-action@v4 + uses: docker/metadata-action@v5 with: images: "openml/openml-python" - name: Build and push id: docker_build - 
uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: ./docker/ - push: true tags: ${{ steps.meta_dockerhub.outputs.tags }} labels: ${{ steps.meta_dockerhub.outputs.labels }} + platforms: linux/amd64,linux/arm64 + push: ${{ github.event_name == 'push' }} + + - name: Update repo description + if: ${{ startsWith(github.ref, 'refs/tags/v') }} + uses: peter-evans/dockerhub-description@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + repository: openml/openml-python + short-description: "pre-installed openml-python environment" + readme-filepath: ./docker/readme.md - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 246c38da4..ab60f59c6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,6 +1,23 @@ name: Tests -on: [push, pull_request] +on: + workflow_dispatch: + + push: + branches: + - main + - develop + tags: + - "v*.*.*" + + pull_request: + branches: + - main + - develop + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test: @@ -8,62 +25,42 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.7, 3.8] - scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24] + python-version: ["3.8"] + # TODO(eddiebergman): We should consider testing against newer version I guess... 
+ # We probably consider just having a `"1"` version to always test against latest + scikit-learn: ["0.23.1", "0.24"] os: [ubuntu-latest] - sklearn-only: ['true'] - exclude: # no scikit-learn 0.21.2 release for Python 3.8 - - python-version: 3.8 - scikit-learn: 0.21.2 + sklearn-only: ["true"] + exclude: # no scikit-learn 0.23 release for Python 3.9 + - python-version: "3.9" + scikit-learn: "0.23.1" include: - - python-version: 3.6 - scikit-learn: 0.18.2 - scipy: 1.2.0 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.19.2 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.20.2 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.21.2 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.22.2 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.23.1 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.24 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.8 + - os: ubuntu-latest + python-version: "3.9" + scikit-learn: "0.24" + scipy: "1.10.0" + sklearn-only: "true" + # Include a code cov version + - code-cov: true + os: ubuntu-latest + python-version: "3.8" scikit-learn: 0.23.1 - code-cov: true sklearn-only: 'false' - os: ubuntu-latest + # Include a windows test, for some reason on a later version of scikit-learn - os: windows-latest - sklearn-only: 'false' + python-version: "3.8" scikit-learn: 0.24.* - scipy: 1.10.0 + scipy: "1.10.0" # not sure why the explicit scipy version? 
+ sklearn-only: 'false' fail-fast: false - max-parallel: 4 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 2 - name: Setup Python ${{ matrix.python-version }} if: matrix.os != 'windows-latest' # windows-latest only uses preinstalled Python (3.7.9) - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install test dependencies diff --git a/.gitignore b/.gitignore index 060db33be..90548b2c3 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,14 @@ doc/auto_examples/ doc/modules/generated/ doc/datasets/generated/ +# Some stuff from testing? +tests/files/org/openml/test/datasets/1/ +tests/files/org/openml/test/datasets/2/features.xml.pkl +tests/files/org/openml/test/datasets/2/qualities.xml.pkl +tests/files/org/openml/test/locks/ +tests/files/org/openml/test/tasks/1/datasplits.pkl.py3 +tests/files/org/openml/test/tasks/1882/datasplits.pkl.py3 + # Distribution / packaging .Python diff --git a/.nojekyll b/.nojekyll deleted file mode 100644 index e69de29bb..000000000 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fc1319d79..3505c316b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,42 +1,48 @@ +default_language_version: + python: python3 +files: | + (?x)^( + openml| + tests + )/.*\.py$ repos: - - repo: https://github.com/psf/black - rev: 23.3.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.13 hooks: - - id: black - args: [--line-length=100] + - id: ruff + args: [--fix, --exit-non-zero-on-fix, --no-cache] + - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.4.1 + rev: v1.8.0 hooks: - id: mypy - name: mypy openml - files: openml/.* additional_dependencies: - types-requests - types-python-dateutil - - id: mypy - name: mypy tests - files: tests/.* - additional_dependencies: - - types-requests - - types-python-dateutil - - id: mypy - name: mypy top-level-functions - files: 
openml/_api_calls.py - additional_dependencies: - - types-requests - - types-python-dateutil - args: [ --disallow-untyped-defs, --disallow-any-generics, - --disallow-any-explicit, --implicit-optional ] - - repo: https://github.com/pycqa/flake8 - rev: 6.0.0 + - repo: https://github.com/python-jsonschema/check-jsonschema + rev: 0.27.3 hooks: - - id: flake8 - name: flake8 openml - files: openml/.* - additional_dependencies: - - flake8-print==5.0.0 - - id: flake8 - name: flake8 tests - files: tests/.* - additional_dependencies: - - flake8-print==5.0.0 + - id: check-github-workflows + files: '^github/workflows/.*\.ya?ml$' + types: ["yaml"] + - id: check-dependabot + files: '^\.github/dependabot\.ya?ml$' + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-added-large-files + files: ".*" + - id: check-case-conflict + files: ".*" + - id: check-merge-conflict + files: ".*" + - id: check-yaml + files: ".*" + - id: end-of-file-fixer + files: ".*" + types: ["yaml"] + - id: check-toml + files: ".*" + types: ["toml"] + - id: debug-statements + files: '^src/.*\.py$' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 87c8ae3c6..c2b4be187 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -214,28 +214,32 @@ Before each commit, it will automatically run: but make sure to make adjustments if it does fail. If you want to run the pre-commit tests without doing a commit, run: - ```bash - $ pre-commit run --all-files - ``` +```bash +$ make check +``` +or on a system without make, like Windows: +```bash +$ pre-commit run --all-files +``` Make sure to do this at least once before your first commit to check your setup works. Executing a specific unit test can be done by specifying the module, test case, and test. 
To obtain a hierarchical list of all tests, run - ```bash - $ pytest --collect-only - - - - - - - - - - - - ``` +```bash +$ pytest --collect-only + + + + + + + + + + + +``` You may then run a specific module, test case, or unit test respectively: ```bash diff --git a/Makefile b/Makefile index 165bcea80..b097bd1f9 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,9 @@ CTAGS ?= ctags all: clean inplace test +check: + pre-commit run --all-files + clean: $(PYTHON) setup.py clean rm -rf dist openml.egg-info diff --git a/doc/conf.py b/doc/conf.py index a10187486..61ba4a46c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -119,7 +119,7 @@ # # currently disabled because without intersphinx we cannot link to numpy.ndarray # nitpicky = True - +linkcheck_ignore = [r"https://test.openml.org/t/.*"] # FIXME: to avoid test server bugs avoiding docs building # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for diff --git a/doc/contributing.rst b/doc/contributing.rst index e8d537338..34d1edb14 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -19,7 +19,7 @@ In particular, a few ways to contribute to openml-python are: For more information, see the :ref:`extensions` below. * Bug reports. If something doesn't work for you or is cumbersome, please open a new issue to let - us know about the problem. See `this section `_. + us know about the problem. See `this section `_. * `Cite OpenML `_ if you use it in a scientific publication. diff --git a/doc/progress.rst b/doc/progress.rst index 493b029e5..13efd720b 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -6,6 +6,23 @@ Changelog ========= +next +~~~~~~ + + * ... + +0.14.2 +~~~~~~ + + * MAINT #1280: Use the server-provided ``parquet_url`` instead of ``minio_url`` to determine the location of the parquet file. + * ADD #716: add documentation for remaining attributes of classes and functions. 
+ * ADD #1261: more annotations for type hints. + * MAINT #1294: update tests to new tag specification. + * FIX #1314: Update fetching a bucket from MinIO. + * FIX #1315: Make class label retrieval more lenient. + * ADD #1316: add feature descriptions ontologies support. + * MAINT #1310/#1307: switch to ruff and resolve all mypy errors. + 0.14.1 ~~~~~~ diff --git a/docker/Dockerfile b/docker/Dockerfile index c27abba40..a84723309 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,8 +2,8 @@ # Useful building docs or running unix tests from a Windows host. FROM python:3.10 -RUN git clone https://github.com/openml/openml-python.git omlp -WORKDIR omlp +RUN git clone https://github.com/openml/openml-python.git openml +WORKDIR openml RUN python -m venv venv RUN venv/bin/pip install wheel setuptools RUN venv/bin/pip install -e .[test,examples,docs,examples_unix] @@ -11,6 +11,8 @@ RUN venv/bin/pip install -e .[test,examples,docs,examples_unix] WORKDIR / RUN mkdir scripts ADD startup.sh scripts/ +ADD readme.md / + # Due to the nature of the Docker container it might often be built from Windows. # It is typical to have the files with \r\n line-ending, we want to remove it for the unix image. RUN sed -i 's/\r//g' scripts/startup.sh diff --git a/docker/readme.md b/docker/readme.md index 47ad6d23b..d0af9d9fe 100644 --- a/docker/readme.md +++ b/docker/readme.md @@ -1,86 +1,131 @@ # OpenML Python Container -This docker container has the latest development version of openml-python downloaded and pre-installed. -It can be used to run the unit tests or build the docs in a fresh and/or isolated unix environment. -Instructions only tested on a Windows host machine. +This docker container has the latest version of openml-python downloaded and pre-installed. +It can also be used by developers to run unit tests or build the docs in +a fresh and/or isolated unix environment. +This document contains information about: -First pull the docker image: + 1. 
[Usage](#usage): how to use the image and its main modes. + 2. [Using local or remote code](#using-local-or-remote-code): useful when testing your own latest changes. + 3. [Versions](#versions): identify which image to use. + 4. [Development](#for-developers): information about the Docker image for developers. - docker pull openml/openml-python +*note:* each docker image is shipped with a readme, which you can read with: +`docker run --entrypoint=/bin/cat openml/openml-python:TAG readme.md` ## Usage +There are three main ways to use the image: running a pre-installed Python environment, +running tests, and building documentation. - docker run -it openml/openml-python [DOC,TEST] [BRANCH] +### Running `Python` with pre-installed `OpenML-Python` (default): -The image is designed to work with two specified directories which may be mounted ([`docker --mount documentation`](https://docs.docker.com/storage/bind-mounts/#start-a-container-with-a-bind-mount)). -You can mount your openml-python folder to the `/code` directory to run tests or build docs on your local files. -You can mount an `/output` directory to which the container will write output (currently only used for docs). -Each can be mounted by adding a `--mount type=bind,source=SOURCE,destination=/DESTINATION` where `SOURCE` is the absolute path to your code or output directory, and `DESTINATION` is either `code` or `output`. - -E.g. mounting a code directory: +To run `Python` with a pre-installed `OpenML-Python` environment run: - docker run -i --mount type=bind,source="E:\\repositories/openml-python",destination="/code" -t openml/openml-python +```text +docker run -it openml/openml-python +``` -E.g. mounting an output directory: +this accepts the normal `Python` arguments, e.g.: - docker run -i --mount type=bind,source="E:\\files/output",destination="/output" -t openml/openml-python +```text +docker run openml/openml-python -c "import openml; print(openml.__version__)" +``` -You can mount both at the same time. 
+if you want to run a local script, it needs to be mounted first. Mount it into the +`openml` folder: -### Bash (default) -By default bash is invoked, you should also use the `-i` flag when starting the container so it processes input: +``` +docker run -v PATH/TO/FILE:/openml/MY_SCRIPT.py openml/openml-python MY_SCRIPT.py +``` - docker run -it openml/openml-python +### Running unit tests -### Building Documentation -There are two ways to build documentation, either directly from the `HEAD` of a branch on Github or from your local directory. +You can run the unit tests by passing `test` as the first argument. +It also requires a local or remote repository to be specified, which is explained +[below](#using-local-or-remote-code). For this example, we specify to test the +`develop` branch: -#### Building from a local repository -Building from a local directory requires you to mount it to the ``/code`` directory: +```text +docker run openml/openml-python test develop +``` - docker run --mount type=bind,source=PATH_TO_REPOSITORY,destination=/code -t openml/openml-python doc +### Building documentation -The produced documentation will be in your repository's ``doc/build`` folder. -If an `/output` folder is mounted, the documentation will *also* be copied there. +You can build the documentation by passing `doc` as the first argument, +you should [mount](https://docs.docker.com/storage/bind-mounts/#start-a-container-with-a-bind-mount) +an output directory in which the docs will be stored. You also need to provide a remote +or local repository as explained in [the section below](#using-local-or-remote-code). +In this example, we build documentation for the `develop` branch. +On Windows: -#### Building from an online repository -Building from a remote repository requires you to specify a branch. 
-The branch may be specified by name directly if it exists on the original repository (https://github.com/openml/openml-python/): +```text + docker run --mount type=bind,source="E:\\files/output",destination="/output" openml/openml-python doc develop +``` - docker run --mount type=bind,source=PATH_TO_OUTPUT,destination=/output -t openml/openml-python doc BRANCH +on Linux: +```text + docker run --mount type=bind,source="./output",destination="/output" openml/openml-python doc develop +``` + +see [the section below](#using-local-or-remote-code) for running against local changes +or a remote branch. -Where `BRANCH` is the name of the branch for which to generate the documentation. -It is also possible to build the documentation from the branch on a fork, in this case the `BRANCH` should be specified as `GITHUB_NAME#BRANCH` (e.g. `PGijsbers#my_feature`) and the name of the forked repository should be `openml-python`. +*Note: you can forgo mounting an output directory to test if the docs build successfully, +but the result will only be available within the docker container under `/openml/docs/build`.* -### Running tests -There are two ways to run tests, either directly from the `HEAD` of a branch on Github or from your local directory. -It works similar to building docs, but should specify `test` as mode. -For example, to run tests on your local repository: +## Using local or remote code - docker run --mount type=bind,source=PATH_TO_REPOSITORY,destination=/code -t openml/openml-python test - -Running tests from the state of an online repository is supported similar to building documentation (i.e. specify `BRANCH` instead of mounting `/code`). - -## Troubleshooting +You can build docs or run tests against your local repository or a Github repository. +In the examples below, change the `source` to match the location of your local repository. 
+ +### Using a local repository + +To use a local directory, mount it in the `/code` directory, on Windows: + +```text + docker run --mount type=bind,source="E:\\repositories/openml-python",destination="/code" openml/openml-python test +``` -When you are mounting a directory you can check that it is mounted correctly by running the image in bash mode. -Navigate to the `/code` and `/output` directories and see if the expected files are there. -If e.g. there is no code in your mounted `/code`, you should double-check the provided path to your host directory. +on Linux: +```text + docker run --mount type=bind,source="/Users/pietergijsbers/repositories/openml-python",destination="/code" openml/openml-python test +``` -## Notes for developers -This section contains some notes about the structure of the image, intended for those who want to work on it. +when building docs, you also need to mount an output directory as shown above, so add both: + +```text +docker run --mount type=bind,source="./output",destination="/output" --mount type=bind,source="/Users/pietergijsbers/repositories/openml-python",destination="/code" openml/openml-python doc +``` + +### Using a Github repository +Building from a remote repository requires you to specify a branch. +The branch may be specified by name directly if it exists on the original repository (https://github.com/openml/openml-python/): + + docker run --mount type=bind,source=PATH_TO_OUTPUT,destination=/output openml/openml-python [test,doc] BRANCH + +Where `BRANCH` is the name of the branch for which to generate the documentation. +It is also possible to build the documentation from the branch on a fork, +in this case the `BRANCH` should be specified as `GITHUB_NAME#BRANCH` (e.g. +`PGijsbers#my_feature_branch`) and the name of the forked repository should be `openml-python`. + +## For developers +This section contains some notes about the structure of the image, +intended for those who want to work on it. 
### Added Directories The `openml/openml-python` image is built on a vanilla `python:3` image. -Additionally it contains the following files are directories: - - - `/omlp`: contains the openml-python repository in the state with which the image was built by default. - If working with a `BRANCH`, this repository will be set to the `HEAD` of `BRANCH`. - - `/omlp/venv/`: contains the used virtual environment for `doc` and `test`. It has `openml-python` dependencies pre-installed. - When invoked with `doc` or `test`, the dependencies will be updated based on the `setup.py` of the `BRANCH` or mounted `/code`. +Additionally, it contains the following files and directories: + + - `/openml`: contains the openml-python repository in the state with which the image + was built by default. If working with a `BRANCH`, this repository will be set to + the `HEAD` of `BRANCH`. + - `/openml/venv/`: contains the used virtual environment for `doc` and `test`. It has + `openml-python` dependencies pre-installed. When invoked with `doc` or `test`, the + dependencies will be updated based on the `setup.py` of the `BRANCH` or mounted `/code`. - `/scripts/startup.sh`: the entrypoint of the image. Takes care of the automated features (e.g. `doc` and `test`). ## Building the image -To build the image yourself, execute `docker build -f Dockerfile .` from this directory. -It will use the `startup.sh` as is, so any local changes will be present in the image. +To build the image yourself, execute `docker build -f Dockerfile .` from the `docker` +directory of the `openml-python` repository. It will use the `startup.sh` as is, so any +local changes will be present in the image. diff --git a/docker/startup.sh b/docker/startup.sh index 2a75a621c..34a5c61f3 100644 --- a/docker/startup.sh +++ b/docker/startup.sh @@ -1,3 +1,6 @@ +# Entry script to switch between the different Docker functionalities. 
+# By default, execute Python with OpenML pre-installed +# # Entry script to allow docker to be ran for bash, tests and docs. # The script assumes a code repository can be mounted to ``/code`` and an output directory to ``/output``. # Executes ``mode`` on ``branch`` or the provided ``code`` directory. @@ -10,10 +13,11 @@ # Can be a branch on a Github fork, specified with the USERNAME#BRANCH format. # The test or doc build is executed on this branch. -if [ -z "$1" ]; then - echo "Executing in BASH mode." - bash - exit +if [[ ! ( $1 = "doc" || $1 = "test" ) ]]; then + cd openml + source venv/bin/activate + python "$@" + exit 0 fi # doc and test modes require mounted directories and/or specified branches @@ -32,8 +36,8 @@ if [ "$1" == "doc" ] && [ -n "$2" ] && ! [ -d "/output" ]; then fi if [ -n "$2" ]; then - # if a branch is provided, we will pull it into the `omlp` local repository that was created with the image. - cd omlp + # if a branch is provided, we will pull it into the `openml` local repository that was created with the image. 
+ cd openml if [[ $2 == *#* ]]; then # If a branch is specified on a fork (with NAME#BRANCH format), we have to construct the url before pulling # We add a trailing '#' delimiter so the second element doesn't get the trailing newline from <<< @@ -52,12 +56,12 @@ if [ -n "$2" ]; then exit 1 fi git pull - code_dir="/omlp" + code_dir="/openml" else code_dir="/code" fi -source /omlp/venv/bin/activate +source /openml/venv/bin/activate cd $code_dir # The most recent ``main`` is already installed, but we want to update any outdated dependencies pip install -e .[test,examples,docs,examples_unix] @@ -71,6 +75,6 @@ if [ "$1" == "doc" ]; then make html make linkcheck if [ -d "/output" ]; then - cp -r /omlp/doc/build /output + cp -r /openml/doc/build /output fi -fi +fi \ No newline at end of file diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 7f3f8cefb..000000000 --- a/mypy.ini +++ /dev/null @@ -1,6 +0,0 @@ -[mypy] -# Reports any config lines that are not recognized -warn_unused_configs=True - -ignore_missing_imports=True -follow_imports=skip diff --git a/openml/__init__.py b/openml/__init__.py index abb83ac0c..48d301eec 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -16,40 +16,46 @@ """ # License: BSD 3-Clause - -from . import _api_calls -from . import config -from .datasets import OpenMLDataset, OpenMLDataFeature -from . import datasets -from . import evaluations +from __future__ import annotations + +from . import ( + _api_calls, + config, + datasets, + evaluations, + exceptions, + extensions, + flows, + runs, + setups, + study, + tasks, + utils, +) +from .__version__ import __version__ +from .datasets import OpenMLDataFeature, OpenMLDataset from .evaluations import OpenMLEvaluation -from . import extensions -from . import exceptions -from . 
import tasks +from .flows import OpenMLFlow +from .runs import OpenMLRun +from .setups import OpenMLParameter, OpenMLSetup +from .study import OpenMLBenchmarkSuite, OpenMLStudy from .tasks import ( - OpenMLTask, - OpenMLSplit, - OpenMLSupervisedTask, OpenMLClassificationTask, - OpenMLRegressionTask, OpenMLClusteringTask, OpenMLLearningCurveTask, + OpenMLRegressionTask, + OpenMLSplit, + OpenMLSupervisedTask, + OpenMLTask, ) -from . import runs -from .runs import OpenMLRun -from . import flows -from .flows import OpenMLFlow -from . import study -from .study import OpenMLStudy, OpenMLBenchmarkSuite -from . import utils -from . import setups -from .setups import OpenMLSetup, OpenMLParameter - - -from .__version__ import __version__ # noqa: F401 -def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None, run_ids=None): +def populate_cache( + task_ids: list[int] | None = None, + dataset_ids: list[int | str] | None = None, + flow_ids: list[int] | None = None, + run_ids: list[int] | None = None, +) -> None: """ Populate a cache for offline and parallel usage of the OpenML connector. 
@@ -117,4 +123,5 @@ def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None, run_ids=None) ] # Load the scikit-learn extension by default -import openml.extensions.sklearn # noqa: F401 +# TODO(eddiebergman): Not sure why this is at the bottom of the file +import openml.extensions.sklearn # noqa: E402, F401 diff --git a/openml/__version__.py b/openml/__version__.py index d44a77ce2..d927c85ca 100644 --- a/openml/__version__.py +++ b/openml/__version__.py @@ -3,4 +3,6 @@ # License: BSD 3-Clause # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.14.1" +from __future__ import annotations + +__version__ = "0.14.2" diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 9ac49495d..9865c86df 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -1,34 +1,47 @@ # License: BSD 3-Clause +from __future__ import annotations -import time import hashlib import logging import math -import pathlib import random -import requests +import time import urllib.parse import xml -import xmltodict -from urllib3 import ProxyManager -from typing import Dict, Optional, Tuple, Union import zipfile +from pathlib import Path +from typing import Dict, Tuple, Union import minio +import requests +import requests.utils +import xmltodict +from urllib3 import ProxyManager from . import config from .exceptions import ( + OpenMLHashException, OpenMLServerError, OpenMLServerException, OpenMLServerNoResult, - OpenMLHashException, ) DATA_TYPE = Dict[str, Union[str, int]] FILE_ELEMENTS_TYPE = Dict[str, Union[str, Tuple[str, str]]] +DATABASE_CONNECTION_ERRCODE = 107 + + +def _robot_delay(n: int) -> float: + wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60 + variation = random.gauss(0, wait / 10) + return max(1.0, wait + variation) -def resolve_env_proxies(url: str) -> Optional[str]: +def _human_delay(n: int) -> float: + return max(1.0, n) + + +def resolve_env_proxies(url: str) -> str | None: """Attempt to find a suitable proxy for this url. 
Relies on ``requests`` internals to remain consistent. To disable this from the @@ -45,8 +58,7 @@ def resolve_env_proxies(url: str) -> Optional[str]: The proxy url if found, else None """ resolved_proxies = requests.utils.get_environ_proxies(url) - selected_proxy = requests.utils.select_proxy(url, resolved_proxies) - return selected_proxy + return requests.utils.select_proxy(url, resolved_proxies) # type: ignore def _create_url_from_endpoint(endpoint: str) -> str: @@ -60,8 +72,8 @@ def _create_url_from_endpoint(endpoint: str) -> str: def _perform_api_call( call: str, request_method: str, - data: Optional[DATA_TYPE] = None, - file_elements: Optional[FILE_ELEMENTS_TYPE] = None, + data: DATA_TYPE | None = None, + file_elements: FILE_ELEMENTS_TYPE | None = None, ) -> str: """ Perform an API call at the OpenML server. @@ -111,17 +123,17 @@ def _perform_api_call( def _download_minio_file( source: str, - destination: Union[str, pathlib.Path], - exists_ok: bool = True, - proxy: Optional[str] = "auto", + destination: str | Path, + exists_ok: bool = True, # noqa: FBT001, FBT002 + proxy: str | None = "auto", ) -> None: """Download file ``source`` from a MinIO Bucket and store it at ``destination``. Parameters ---------- - source : Union[str, pathlib.Path] + source : str URL to a file in a MinIO bucket. - destination : str + destination : str | Path Path to store the file to, if a directory is provided the original filename is used. exists_ok : bool, optional (default=True) If False, raise FileExists if a file already exists in ``destination``. @@ -130,13 +142,13 @@ def _download_minio_file( automatically find the proxy to use. Pass None or the environment variable ``no_proxy="*"`` to disable proxies. 
""" - destination = pathlib.Path(destination) + destination = Path(destination) parsed_url = urllib.parse.urlparse(source) # expect path format: /BUCKET/path/to/file.ext bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1) if destination.is_dir(): - destination = pathlib.Path(destination, object_name) + destination = Path(destination, object_name) if destination.is_file() and not exists_ok: raise FileExistsError(f"File already exists in {destination}.") @@ -158,53 +170,52 @@ def _download_minio_file( zip_ref.extractall(destination.parent) except minio.error.S3Error as e: - if e.message.startswith("Object does not exist"): + if e.message is not None and e.message.startswith("Object does not exist"): raise FileNotFoundError(f"Object at '{source}' does not exist.") from e # e.g. permission error, or a bucket does not exist (which is also interpreted as a # permission error on minio level). raise FileNotFoundError("Bucket does not exist or is private.") from e -def _download_minio_bucket( - source: str, - destination: Union[str, pathlib.Path], - exists_ok: bool = True, -) -> None: +def _download_minio_bucket(source: str, destination: str | Path) -> None: """Download file ``source`` from a MinIO Bucket and store it at ``destination``. Parameters ---------- - source : Union[str, pathlib.Path] + source : str URL to a MinIO bucket. - destination : str + destination : str | Path Path to a directory to store the bucket content in. exists_ok : bool, optional (default=True) If False, raise FileExists if a file already exists in ``destination``. 
""" - - destination = pathlib.Path(destination) + destination = Path(destination) parsed_url = urllib.parse.urlparse(source) # expect path format: /BUCKET/path/to/file.ext - bucket = parsed_url.path[1:] + _, bucket, *prefixes, _file = parsed_url.path.split("/") + prefix = "/".join(prefixes) client = minio.Minio(endpoint=parsed_url.netloc, secure=False) - for file_object in client.list_objects(bucket, recursive=True): + for file_object in client.list_objects(bucket, prefix=prefix, recursive=True): + if file_object.object_name is None: + raise ValueError("Object name is None.") + _download_minio_file( - source=source + "/" + file_object.object_name, - destination=pathlib.Path(destination, file_object.object_name), + source=source.rsplit("/", 1)[0] + "/" + file_object.object_name.rsplit("/", 1)[1], + destination=Path(destination, file_object.object_name.rsplit("/", 1)[1]), exists_ok=True, ) def _download_text_file( source: str, - output_path: Optional[str] = None, - md5_checksum: Optional[str] = None, - exists_ok: bool = True, + output_path: str | Path | None = None, + md5_checksum: str | None = None, + exists_ok: bool = True, # noqa: FBT001, FBT002 encoding: str = "utf8", -) -> Optional[str]: +) -> str | None: """Download the text file at `source` and store it in `output_path`. By default, do nothing if a file already exists in `output_path`. @@ -214,7 +225,7 @@ def _download_text_file( ---------- source : str url of the file to be downloaded - output_path : str, (optional) + output_path : str | Path | None (default=None) full path, including filename, of where the file should be stored. If ``None``, this function returns the downloaded file as string. md5_checksum : str, optional (default=None) @@ -224,15 +235,14 @@ def _download_text_file( encoding : str, optional (default='utf8') The encoding with which the file should be stored. 
""" - if output_path is not None: - try: - with open(output_path, encoding=encoding): - if exists_ok: - return None - else: - raise FileExistsError - except FileNotFoundError: - pass + if isinstance(output_path, str): + output_path = Path(output_path) + + if output_path is not None and output_path.exists(): + if not exists_ok: + raise FileExistsError + + return None logging.info("Starting [%s] request for the URL %s", "get", source) start = time.time() @@ -248,87 +258,93 @@ def _download_text_file( ) return downloaded_file - else: - with open(output_path, "w", encoding=encoding) as fh: - fh.write(downloaded_file) - - logging.info( - "%.7fs taken for [%s] request for the URL %s", - time.time() - start, - "get", - source, - ) + with output_path.open("w", encoding=encoding) as fh: + fh.write(downloaded_file) - del downloaded_file - return None + logging.info( + "%.7fs taken for [%s] request for the URL %s", + time.time() - start, + "get", + source, + ) + return None -def _file_id_to_url(file_id: str, filename: Optional[str] = None) -> str: +def _file_id_to_url(file_id: int, filename: str | None = None) -> str: """ Presents the URL how to download a given file id filename is optional """ openml_url = config.server.split("/api/") - url = openml_url[0] + "/data/download/%s" % file_id + url = openml_url[0] + f"/data/download/{file_id!s}" if filename is not None: url += "/" + filename return url def _read_url_files( - url: str, data: Optional[DATA_TYPE] = None, file_elements: Optional[FILE_ELEMENTS_TYPE] = None + url: str, + data: DATA_TYPE | None = None, + file_elements: FILE_ELEMENTS_TYPE | None = None, ) -> requests.Response: - """do a post request to url with data - and sending file_elements as files""" - + """Do a post request to url with data + and sending file_elements as files + """ data = {} if data is None else data data["api_key"] = config.apikey if file_elements is None: file_elements = {} # Using requests.post sets header 'Accept-encoding' automatically to # 
'gzip,deflate' - response = _send_request( + return _send_request( request_method="post", url=url, data=data, files=file_elements, ) - return response def __read_url( url: str, request_method: str, - data: Optional[DATA_TYPE] = None, - md5_checksum: Optional[str] = None, + data: DATA_TYPE | None = None, + md5_checksum: str | None = None, ) -> requests.Response: data = {} if data is None else data if config.apikey: data["api_key"] = config.apikey return _send_request( - request_method=request_method, url=url, data=data, md5_checksum=md5_checksum + request_method=request_method, + url=url, + data=data, + md5_checksum=md5_checksum, ) -def __is_checksum_equal(downloaded_file_binary: bytes, md5_checksum: Optional[str] = None) -> bool: +def __is_checksum_equal(downloaded_file_binary: bytes, md5_checksum: str | None = None) -> bool: if md5_checksum is None: return True - md5 = hashlib.md5() + md5 = hashlib.md5() # noqa: S324 md5.update(downloaded_file_binary) md5_checksum_download = md5.hexdigest() return md5_checksum == md5_checksum_download -def _send_request( +def _send_request( # noqa: C901 request_method: str, url: str, data: DATA_TYPE, - files: Optional[FILE_ELEMENTS_TYPE] = None, - md5_checksum: Optional[str] = None, + files: FILE_ELEMENTS_TYPE | None = None, + md5_checksum: str | None = None, ) -> requests.Response: n_retries = max(1, config.connection_n_retries) - response: requests.Response + response: requests.Response | None = None + delay_method = _human_delay if config.retry_policy == "human" else _robot_delay + + # Error to raise in case of retrying too often. Will be set to the last observed exception. 
+ retry_raise_e: Exception | None = None + with requests.Session() as session: # Start at one to have a non-zero multiplier for the sleep for retry_counter in range(1, n_retries + 1): @@ -341,7 +357,9 @@ def _send_request( response = session.post(url, data=data, files=files) else: raise NotImplementedError() + __check_response(response=response, url=url, file_elements=files) + if request_method == "get" and not __is_checksum_equal( response.text.encode("utf-8"), md5_checksum ): @@ -352,85 +370,86 @@ def _send_request( "because the text encoding is not UTF-8 when downloading {}. " "There might be a sever-sided issue with the file, " "see: https://github.com/openml/openml-python/issues/1180.".format( - md5_checksum, url - ) + md5_checksum, + url, + ), ) raise OpenMLHashException( "Checksum of downloaded file is unequal to the expected checksum {} " - "when downloading {}.".format(md5_checksum, url) + "when downloading {}.".format(md5_checksum, url), ) - break + + return response + except OpenMLServerException as e: + # Propagate all server errors to the calling functions, except + # for 107 which represents a database connection error. + # These are typically caused by high server load, + # which means trying again might resolve the issue. + if e.code != DATABASE_CONNECTION_ERRCODE: + raise e + retry_raise_e = e + except xml.parsers.expat.ExpatError as e: + if request_method != "get" or retry_counter >= n_retries: + if response is not None: + extra = f"Status code: {response.status_code}\n{response.text}" + else: + extra = "No response retrieved." + + raise OpenMLServerError( + f"Unexpected server error when calling {url}. 
Please contact the " + f"developers!\n{extra}" + ) from e + retry_raise_e = e except ( requests.exceptions.ChunkedEncodingError, requests.exceptions.ConnectionError, requests.exceptions.SSLError, - OpenMLServerException, - xml.parsers.expat.ExpatError, OpenMLHashException, ) as e: - if isinstance(e, OpenMLServerException) and e.code != 107: - # Propagate all server errors to the calling functions, except - # for 107 which represents a database connection error. - # These are typically caused by high server load, - # which means trying again might resolve the issue. - raise - elif isinstance(e, xml.parsers.expat.ExpatError): - if request_method != "get" or retry_counter >= n_retries: - raise OpenMLServerError( - "Unexpected server error when calling {}. Please contact the " - "developers!\nStatus code: {}\n{}".format( - url, - response.status_code, - response.text, - ) - ) - if retry_counter >= n_retries: - raise - else: + retry_raise_e = e - def robot(n: int) -> float: - wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60 - variation = random.gauss(0, wait / 10) - return max(1.0, wait + variation) + # We can only be here if there was an exception + assert retry_raise_e is not None + if retry_counter >= n_retries: + raise retry_raise_e + delay = delay_method(retry_counter) + time.sleep(delay) - def human(n: int) -> float: - return max(1.0, n) - - delay = {"human": human, "robot": robot}[config.retry_policy](retry_counter) - time.sleep(delay) + assert response is not None return response def __check_response( - response: requests.Response, url: str, file_elements: Optional[FILE_ELEMENTS_TYPE] + response: requests.Response, + url: str, + file_elements: FILE_ELEMENTS_TYPE | None, ) -> None: if response.status_code != 200: raise __parse_server_exception(response, url, file_elements=file_elements) - elif ( - "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip" - ): - logging.warning("Received uncompressed content from OpenML for 
{}.".format(url)) + if "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip": + logging.warning(f"Received uncompressed content from OpenML for {url}.") def __parse_server_exception( response: requests.Response, url: str, - file_elements: Optional[FILE_ELEMENTS_TYPE], + file_elements: FILE_ELEMENTS_TYPE | None, ) -> OpenMLServerError: if response.status_code == 414: - raise OpenMLServerError("URI too long! ({})".format(url)) + raise OpenMLServerError(f"URI too long! ({url})") + try: server_exception = xmltodict.parse(response.text) - except xml.parsers.expat.ExpatError: - raise - except Exception: + except xml.parsers.expat.ExpatError as e: + raise e + except Exception as e: # noqa: BLE001 # OpenML has a sophisticated error system # where information about failures is provided. try to parse this raise OpenMLServerError( - "Unexpected server error when calling {}. Please contact the developers!\n" - "Status code: {}\n{}".format(url, response.status_code, response.text) - ) + f"Unexpected server error when calling {url}. 
Please contact the developers!\n" + f"Status code: {response.status_code}\n{response.text}", + ) from e server_error = server_exception["oml:error"] code = int(server_error["oml:code"]) @@ -438,7 +457,7 @@ def __parse_server_exception( additional_information = server_error.get("oml:additional_information") if code in [372, 512, 500, 482, 542, 674]: if additional_information: - full_message = "{} - {}".format(message, additional_information) + full_message = f"{message} - {additional_information}" else: full_message = message @@ -457,5 +476,5 @@ def __parse_server_exception( additional_information, ) else: - full_message = "{} - {}".format(message, additional_information) + full_message = f"{message} - {additional_information}" return OpenMLServerException(code=code, message=full_message, url=url) diff --git a/openml/base.py b/openml/base.py index 35a9ce58f..37693a2ec 100644 --- a/openml/base.py +++ b/openml/base.py @@ -1,32 +1,33 @@ # License: BSD 3-Clause +from __future__ import annotations -from abc import ABC, abstractmethod -from collections import OrderedDict import re -from typing import Optional, List, Tuple, Union, Dict import webbrowser +from abc import ABC, abstractmethod +from typing import Iterable, Sequence import xmltodict +import openml._api_calls import openml.config -from .utils import _tag_openml_base, _get_rest_api_type_alias + +from .utils import _get_rest_api_type_alias, _tag_openml_base class OpenMLBase(ABC): """Base object for functionality that is shared across entities.""" - def __repr__(self): + def __repr__(self) -> str: body_fields = self._get_repr_body_fields() return self._apply_repr_template(body_fields) @property @abstractmethod - def id(self) -> Optional[int]: + def id(self) -> int | None: """The id of the entity, it is unique for its entity type.""" - pass @property - def openml_url(self) -> Optional[str]: + def openml_url(self) -> str | None: """The URL of the object on the server, if it was uploaded, else None.""" if self.id is 
None: return None @@ -36,7 +37,7 @@ def openml_url(self) -> Optional[str]: def url_for_id(cls, id_: int) -> str: """Return the OpenML URL for the object of the class entity with the given id.""" # Sample url for a flow: openml.org/f/123 - return "{}/{}/{}".format(openml.config.get_server_base_url(), cls._entity_letter(), id_) + return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}" @classmethod def _entity_letter(cls) -> str: @@ -45,21 +46,24 @@ def _entity_letter(cls) -> str: # which holds for all entities except studies and tasks, which overwrite this method. return cls.__name__.lower()[len("OpenML") :][0] + # TODO(eddiebergman): This would be much cleaner as an iterator... @abstractmethod - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: """Collect all information to display in the __repr__ body. Returns - ------ + ------- body_fields : List[Tuple[str, Union[str, int, List[str]]]] A list of (name, value) pairs to display in the body of the __repr__. E.g.: [('metric', 'accuracy'), ('dataset', 'iris')] If value is a List of str, then each item of the list will appear in a separate row. """ # Should be implemented in the base class. - pass - def _apply_repr_template(self, body_fields: List[Tuple[str, str]]) -> str: + def _apply_repr_template( + self, + body_fields: Iterable[tuple[str, str | int | list[str] | None]], + ) -> str: """Generates the header and formats the body for string representation of the object. Parameters @@ -69,33 +73,34 @@ def _apply_repr_template(self, body_fields: List[Tuple[str, str]]) -> str: """ # We add spaces between capitals, e.g. 
ClassificationTask -> Classification Task name_with_spaces = re.sub( - r"(\w)([A-Z])", r"\1 \2", self.__class__.__name__[len("OpenML") :] + r"(\w)([A-Z])", + r"\1 \2", + self.__class__.__name__[len("OpenML") :], ) - header_text = "OpenML {}".format(name_with_spaces) + header_text = f"OpenML {name_with_spaces}" header = "{}\n{}\n".format(header_text, "=" * len(header_text)) - longest_field_name_length = max(len(name) for name, value in body_fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = "\n".join(field_line_format.format(name, value) for name, value in body_fields) + _body_fields: list[tuple[str, str | int | list[str]]] = [ + (k, "None" if v is None else v) for k, v in body_fields + ] + longest_field_name_length = max(len(name) for name, _ in _body_fields) + field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" + body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields) return header + body @abstractmethod - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": + def _to_dict(self) -> dict[str, dict]: """Creates a dictionary representation of self. - Uses OrderedDict to ensure consistent ordering when converting to xml. - The return value (OrderedDict) will be used to create the upload xml file. + The return value will be used to create the upload xml file. The xml file must have the tags in exactly the order of the object's xsd. (see https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/). Returns ------- - OrderedDict - Flow represented as OrderedDict. - + Thing represented as dict. """ # Should be implemented in the base class. 
- pass def _to_xml(self) -> str: """Generate xml representation of self for upload to server.""" @@ -104,10 +109,10 @@ def _to_xml(self) -> str: # A task may not be uploaded with the xml encoding specification: # - encoding_specification, xml_body = xml_representation.split("\n", 1) - return xml_body + _encoding_specification, xml_body = xml_representation.split("\n", 1) + return str(xml_body) - def _get_file_elements(self) -> Dict: + def _get_file_elements(self) -> openml._api_calls.FILE_ELEMENTS_TYPE: """Get file_elements to upload to the server, called during Publish. Derived child classes should overwrite this method as necessary. @@ -116,30 +121,37 @@ def _get_file_elements(self) -> Dict: return {} @abstractmethod - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict[str, str]) -> None: """Parse the id from the xml_response and assign it to self.""" - pass - def publish(self) -> "OpenMLBase": + def publish(self) -> OpenMLBase: + """Publish the object on the OpenML server.""" file_elements = self._get_file_elements() if "description" not in file_elements: file_elements["description"] = self._to_xml() - call = "{}/".format(_get_rest_api_type_alias(self)) + call = f"{_get_rest_api_type_alias(self)}/" response_text = openml._api_calls._perform_api_call( - call, "post", file_elements=file_elements + call, + "post", + file_elements=file_elements, ) xml_response = xmltodict.parse(response_text) self._parse_publish_response(xml_response) return self - def open_in_browser(self): + def open_in_browser(self) -> None: """Opens the OpenML web page corresponding to this object in your default browser.""" + if self.openml_url is None: + raise ValueError( + "Cannot open element on OpenML.org when attribute `openml_url` is `None`", + ) + webbrowser.open(self.openml_url) - def push_tag(self, tag: str): + def push_tag(self, tag: str) -> None: """Annotates this entity with a tag on the server. 
Parameters @@ -149,7 +161,7 @@ def push_tag(self, tag: str): """ _tag_openml_base(self, tag) - def remove_tag(self, tag: str): + def remove_tag(self, tag: str) -> None: """Removes a tag from this entity on the server. Parameters diff --git a/openml/cli.py b/openml/cli.py index 039ac227c..5732442d0 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -1,13 +1,13 @@ -"""" Command Line Interface for `openml` to configure its settings. """ +""""Command Line Interface for `openml` to configure its settings.""" +from __future__ import annotations import argparse -import os -import pathlib import string -from typing import Union, Callable +import sys +from pathlib import Path +from typing import Callable from urllib.parse import urlparse - from openml import config @@ -19,12 +19,14 @@ def looks_like_url(url: str) -> bool: # There's no thorough url parser, but we only seem to use netloc. try: return bool(urlparse(url).netloc) - except Exception: + except Exception: # noqa: BLE001 return False def wait_until_valid_input( - prompt: str, check: Callable[[str], str], sanitize: Union[Callable[[str], str], None] + prompt: str, + check: Callable[[str], str], + sanitize: Callable[[str], str] | None, ) -> str: """Asks `prompt` until an input is received which returns True for `check`. 
@@ -43,7 +45,6 @@ def wait_until_valid_input( valid input """ - while True: response = input(prompt) if sanitize: @@ -55,7 +56,7 @@ def wait_until_valid_input( return response -def print_configuration(): +def print_configuration() -> None: file = config.determine_config_file_path() header = f"File '{file}' contains (or defaults to):" print(header) @@ -65,7 +66,7 @@ def print_configuration(): print(f"{field.ljust(max_key_length)}: {value}") -def verbose_set(field, value): +def verbose_set(field: str, value: str) -> None: config.set_field_in_config_file(field, value) print(f"{field} set to '{value}'.") @@ -123,17 +124,20 @@ def replace_shorthand(server: str) -> str: def configure_cachedir(value: str) -> None: def check_cache_dir(path: str) -> str: - p = pathlib.Path(path) - if p.is_file(): - return f"'{path}' is a file, not a directory." - expanded = p.expanduser() + _path = Path(path) + if _path.is_file(): + return f"'{_path}' is a file, not a directory." + + expanded = _path.expanduser() if not expanded.is_absolute(): - return f"'{path}' is not absolute (even after expanding '~')." + return f"'{_path}' is not absolute (even after expanding '~')." + if not expanded.exists(): try: - os.mkdir(expanded) + expanded.mkdir() except PermissionError: return f"'{path}' does not exist and there are not enough permissions to create it." + return "" configure_field( @@ -143,7 +147,6 @@ def check_cache_dir(path: str) -> str: intro_message="Configuring the cache directory. 
It can not be a relative path.", input_message="Specify the directory to use (or create) as cache directory: ", ) - print("NOTE: Data from your old cache directory is not moved over.") def configure_connection_n_retries(value: str) -> None: @@ -244,13 +247,13 @@ def autocomplete_policy(policy: str) -> str: ) -def configure_field( +def configure_field( # noqa: PLR0913 field: str, - value: Union[None, str], + value: None | str, check_with_message: Callable[[str], str], intro_message: str, input_message: str, - sanitize: Union[Callable[[str], str], None] = None, + sanitize: Callable[[str], str] | None = None, ) -> None: """Configure `field` with `value`. If `value` is None ask the user for input. @@ -284,7 +287,7 @@ def configure_field( malformed_input = check_with_message(value) if malformed_input: print(malformed_input) - quit() + sys.exit() else: print(intro_message) value = wait_until_valid_input( @@ -295,7 +298,7 @@ def configure_field( verbose_set(field, value) -def configure(args: argparse.Namespace): +def configure(args: argparse.Namespace) -> None: """Calls the right submenu(s) to edit `args.field` in the configuration file.""" set_functions = { "apikey": configure_apikey, @@ -307,7 +310,7 @@ def configure(args: argparse.Namespace): "verbosity": configure_verbosity, } - def not_supported_yet(_): + def not_supported_yet(_: str) -> None: print(f"Setting '{args.field}' is not supported yet.") if args.field not in ["all", "none"]: @@ -315,12 +318,11 @@ def not_supported_yet(_): else: if args.value is not None: print(f"Can not set value ('{args.value}') when field is specified as '{args.field}'.") - quit() + sys.exit() print_configuration() if args.field == "all": for set_field_function in set_functions.values(): - print() # Visually separating the output by field. 
set_field_function(args.value) diff --git a/openml/config.py b/openml/config.py index b68455a9b..4744dbe86 100644 --- a/openml/config.py +++ b/openml/config.py @@ -1,30 +1,38 @@ -""" -Store module level information like the API key, cache directory and the server -""" +"""Store module level information like the API key, cache directory and the server""" # License: BSD 3-Clause +from __future__ import annotations +import configparser import logging import logging.handlers import os -from pathlib import Path import platform -from typing import Tuple, cast, Any, Optional import warnings - from io import StringIO -import configparser +from pathlib import Path +from typing import Any, cast +from typing_extensions import Literal, TypedDict from urllib.parse import urlparse logger = logging.getLogger(__name__) openml_logger = logging.getLogger("openml") -console_handler = None -file_handler = None +console_handler: logging.StreamHandler | None = None +file_handler: logging.handlers.RotatingFileHandler | None = None + +class _Config(TypedDict): + apikey: str + server: str + cachedir: Path + avoid_duplicate_runs: bool + retry_policy: Literal["human", "robot"] + connection_n_retries: int -def _create_log_handlers(create_file_handler=True): + +def _create_log_handlers(create_file_handler: bool = True) -> None: # noqa: FBT001, FBT002 """Creates but does not attach the log handlers.""" - global console_handler, file_handler + global console_handler, file_handler # noqa: PLW0603 if console_handler is not None or file_handler is not None: logger.debug("Requested to create log handlers, but they are already created.") return @@ -37,14 +45,17 @@ def _create_log_handlers(create_file_handler=True): if create_file_handler: one_mb = 2**20 - log_path = os.path.join(_root_cache_directory, "openml_python.log") + log_path = _root_cache_directory / "openml_python.log" file_handler = logging.handlers.RotatingFileHandler( - log_path, maxBytes=one_mb, backupCount=1, delay=True + log_path, + 
maxBytes=one_mb, + backupCount=1, + delay=True, ) file_handler.setFormatter(output_formatter) -def _convert_log_levels(log_level: int) -> Tuple[int, int]: +def _convert_log_levels(log_level: int) -> tuple[int, int]: """Converts a log level that's either defined by OpenML/Python to both specifications.""" # OpenML verbosity level don't match Python values directly: openml_to_python = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG} @@ -61,9 +72,9 @@ def _convert_log_levels(log_level: int) -> Tuple[int, int]: return openml_level, python_level -def _set_level_register_and_store(handler: logging.Handler, log_level: int): +def _set_level_register_and_store(handler: logging.Handler, log_level: int) -> None: """Set handler log level, register it if needed, save setting to config file if specified.""" - oml_level, py_level = _convert_log_levels(log_level) + _oml_level, py_level = _convert_log_levels(log_level) handler.setLevel(py_level) if openml_logger.level > py_level or openml_logger.level == logging.NOTSET: @@ -73,42 +84,38 @@ def _set_level_register_and_store(handler: logging.Handler, log_level: int): openml_logger.addHandler(handler) -def set_console_log_level(console_output_level: int): +def set_console_log_level(console_output_level: int) -> None: """Set console output to the desired level and register it with openml logger if needed.""" - global console_handler - _set_level_register_and_store(cast(logging.Handler, console_handler), console_output_level) + global console_handler # noqa: PLW0602 + assert console_handler is not None + _set_level_register_and_store(console_handler, console_output_level) -def set_file_log_level(file_output_level: int): +def set_file_log_level(file_output_level: int) -> None: """Set file output to the desired level and register it with openml logger if needed.""" - global file_handler - _set_level_register_and_store(cast(logging.Handler, file_handler), file_output_level) + global file_handler # noqa: PLW0602 + assert file_handler 
is not None + _set_level_register_and_store(file_handler, file_output_level) # Default values (see also https://github.com/openml/OpenML/wiki/Client-API-Standards) -_defaults = { +_user_path = Path("~").expanduser().absolute() +_defaults: _Config = { "apikey": "", "server": "https://www.openml.org/api/v1/xml", "cachedir": ( - os.environ.get( - "XDG_CACHE_HOME", - os.path.join( - "~", - ".cache", - "openml", - ), - ) + Path(os.environ.get("XDG_CACHE_HOME", _user_path / ".cache" / "openml")) if platform.system() == "Linux" - else os.path.join("~", ".openml") + else _user_path / ".openml" ), - "avoid_duplicate_runs": "True", + "avoid_duplicate_runs": True, "retry_policy": "human", - "connection_n_retries": "5", + "connection_n_retries": 5, } # Default values are actually added here in the _setup() function which is # called at the end of this module -server = str(_defaults["server"]) # so mypy knows it is a string +server = _defaults["server"] def get_server_base_url() -> str: @@ -117,32 +124,34 @@ def get_server_base_url() -> str: Turns ``"https://www.openml.org/api/v1/xml"`` in ``"https://www.openml.org/"`` Returns - ======= + ------- str """ return server.split("/api")[0] -apikey = _defaults["apikey"] +apikey: str = _defaults["apikey"] # The current cache directory (without the server name) -_root_cache_directory = str(_defaults["cachedir"]) # so mypy knows it is a string -avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False +_root_cache_directory = Path(_defaults["cachedir"]) +avoid_duplicate_runs = _defaults["avoid_duplicate_runs"] retry_policy = _defaults["retry_policy"] -connection_n_retries = int(_defaults["connection_n_retries"]) +connection_n_retries = _defaults["connection_n_retries"] -def set_retry_policy(value: str, n_retries: Optional[int] = None) -> None: - global retry_policy - global connection_n_retries - default_retries_by_policy = dict(human=5, robot=50) +def set_retry_policy(value: Literal["human", "robot"], 
n_retries: int | None = None) -> None: + global retry_policy # noqa: PLW0603 + global connection_n_retries # noqa: PLW0603 + default_retries_by_policy = {"human": 5, "robot": 50} if value not in default_retries_by_policy: raise ValueError( - f"Detected retry_policy '{value}' but must be one of {default_retries_by_policy}" + f"Detected retry_policy '{value}' but must be one of " + f"{list(default_retries_by_policy.keys())}", ) if n_retries is not None and not isinstance(n_retries, int): raise TypeError(f"`n_retries` must be of type `int` or `None` but is `{type(n_retries)}`.") + if isinstance(n_retries, int) and n_retries < 1: raise ValueError(f"`n_retries` is '{n_retries}' but must be positive.") @@ -160,14 +169,14 @@ class ConfigurationForExamples: _test_apikey = "c0c42819af31e706efe1f4b88c23c6c1" @classmethod - def start_using_configuration_for_example(cls): + def start_using_configuration_for_example(cls) -> None: """Sets the configuration to connect to the test server with valid apikey. To configuration as was before this call is stored, and can be recovered by using the `stop_use_example_configuration` method. """ - global server - global apikey + global server # noqa: PLW0603 + global apikey # noqa: PLW0603 if cls._start_last_called and server == cls._test_server and apikey == cls._test_apikey: # Method is called more than once in a row without modifying the server or apikey. @@ -182,26 +191,27 @@ def start_using_configuration_for_example(cls): server = cls._test_server apikey = cls._test_apikey warnings.warn( - "Switching to the test server {} to not upload results to the live server. " - "Using the test server may result in reduced performance of the API!".format(server) + f"Switching to the test server {server} to not upload results to the live server. 
" + "Using the test server may result in reduced performance of the API!", + stacklevel=2, ) @classmethod - def stop_using_configuration_for_example(cls): + def stop_using_configuration_for_example(cls) -> None: """Return to configuration as it was before `start_use_example_configuration`.""" if not cls._start_last_called: # We don't want to allow this because it will (likely) result in the `server` and # `apikey` variables being set to None. raise RuntimeError( "`stop_use_example_configuration` called without a saved config." - "`start_use_example_configuration` must be called first." + "`start_use_example_configuration` must be called first.", ) - global server - global apikey + global server # noqa: PLW0603 + global apikey # noqa: PLW0603 - server = cls._last_used_server - apikey = cls._last_used_key + server = cast(str, cls._last_used_server) + apikey = cast(str, cls._last_used_key) cls._start_last_called = False @@ -211,11 +221,11 @@ def determine_config_file_path() -> Path: else: config_dir = Path("~") / ".openml" # Still use os.path.expanduser to trigger the mock in the unit test - config_dir = Path(os.path.expanduser(config_dir)) + config_dir = Path(config_dir).expanduser().resolve() return config_dir / "config" -def _setup(config=None): +def _setup(config: _Config | None = None) -> None: """Setup openml package. Called on first import. Reads the config file and sets up apikey, server, cache appropriately. @@ -224,58 +234,48 @@ def _setup(config=None): openml.config.server = SOMESERVER We could also make it a property but that's less clear. 
""" - global apikey - global server - global _root_cache_directory - global avoid_duplicate_runs + global apikey # noqa: PLW0603 + global server # noqa: PLW0603 + global _root_cache_directory # noqa: PLW0603 + global avoid_duplicate_runs # noqa: PLW0603 config_file = determine_config_file_path() config_dir = config_file.parent # read config file, create directory for config file - if not os.path.exists(config_dir): - try: - os.makedirs(config_dir, exist_ok=True) - cache_exists = True - except PermissionError: - cache_exists = False - else: - cache_exists = True + try: + if not config_dir.exists(): + config_dir.mkdir(exist_ok=True, parents=True) + except PermissionError: + pass if config is None: config = _parse_config(config_file) - def _get(config, key): - return config.get("FAKE_SECTION", key) + avoid_duplicate_runs = config.get("avoid_duplicate_runs", False) + apikey = config["apikey"] + server = config["server"] + short_cache_dir = config["cachedir"] + n_retries = config["connection_n_retries"] - avoid_duplicate_runs = config.getboolean("FAKE_SECTION", "avoid_duplicate_runs") - else: - - def _get(config, key): - return config.get(key) - - avoid_duplicate_runs = config.get("avoid_duplicate_runs") + set_retry_policy(config["retry_policy"], n_retries) - apikey = _get(config, "apikey") - server = _get(config, "server") - short_cache_dir = _get(config, "cachedir") + _root_cache_directory = short_cache_dir.expanduser().resolve() - n_retries = _get(config, "connection_n_retries") - if n_retries is not None: - n_retries = int(n_retries) - - set_retry_policy(_get(config, "retry_policy"), n_retries) + try: + cache_exists = _root_cache_directory.exists() + except PermissionError: + cache_exists = False - _root_cache_directory = os.path.expanduser(short_cache_dir) # create the cache subdirectory - if not os.path.exists(_root_cache_directory): - try: - os.makedirs(_root_cache_directory, exist_ok=True) - except PermissionError: - openml_logger.warning( - "No permission to 
create openml cache directory at %s! This can result in " - "OpenML-Python not working properly." % _root_cache_directory - ) + try: + if not _root_cache_directory.exists(): + _root_cache_directory.mkdir(exist_ok=True, parents=True) + except PermissionError: + openml_logger.warning( + "No permission to create openml cache directory at %s! This can result in " + "OpenML-Python not working properly." % _root_cache_directory, + ) if cache_exists: _create_log_handlers() @@ -283,41 +283,43 @@ def _get(config, key): _create_log_handlers(create_file_handler=False) openml_logger.warning( "No permission to create OpenML directory at %s! This can result in OpenML-Python " - "not working properly." % config_dir + "not working properly." % config_dir, ) -def set_field_in_config_file(field: str, value: Any): +def set_field_in_config_file(field: str, value: Any) -> None: """Overwrites the `field` in the configuration file with the new `value`.""" if field not in _defaults: - return ValueError(f"Field '{field}' is not valid and must be one of '{_defaults.keys()}'.") + raise ValueError(f"Field '{field}' is not valid and must be one of '{_defaults.keys()}'.") + # TODO(eddiebergman): This use of globals has gone too far globals()[field] = value config_file = determine_config_file_path() - config = _parse_config(str(config_file)) - with open(config_file, "w") as fh: - for f in _defaults.keys(): + config = _parse_config(config_file) + with config_file.open("w") as fh: + for f in _defaults: # We can't blindly set all values based on globals() because when the user # sets it through config.FIELD it should not be stored to file. # There doesn't seem to be a way to avoid writing defaults to file with configparser, # because it is impossible to distinguish from an explicitly set value that matches # the default value, to one that was set to its default because it was omitted. 
- value = config.get("FAKE_SECTION", f) + value = config.get("FAKE_SECTION", f) # type: ignore if f == field: value = globals()[f] fh.write(f"{f} = {value}\n") -def _parse_config(config_file: str): +def _parse_config(config_file: str | Path) -> _Config: """Parse the config file, set up defaults.""" - config = configparser.RawConfigParser(defaults=_defaults) + config_file = Path(config_file) + config = configparser.RawConfigParser(defaults=_defaults) # type: ignore # The ConfigParser requires a [SECTION_HEADER], which we do not expect in our config file. # Cheat the ConfigParser module by adding a fake section header config_file_ = StringIO() config_file_.write("[FAKE_SECTION]\n") try: - with open(config_file) as fh: + with config_file.open() as fh: for line in fh: config_file_.write(line) except FileNotFoundError: @@ -326,21 +328,22 @@ def _parse_config(config_file: str): logger.info("Error opening file %s: %s", config_file, e.args[0]) config_file_.seek(0) config.read_file(config_file_) - return config + return dict(config.items("FAKE_SECTION")) # type: ignore -def get_config_as_dict(): - config = dict() - config["apikey"] = apikey - config["server"] = server - config["cachedir"] = _root_cache_directory - config["avoid_duplicate_runs"] = avoid_duplicate_runs - config["connection_n_retries"] = connection_n_retries - config["retry_policy"] = retry_policy - return config +def get_config_as_dict() -> _Config: + return { + "apikey": apikey, + "server": server, + "cachedir": _root_cache_directory, + "avoid_duplicate_runs": avoid_duplicate_runs, + "connection_n_retries": connection_n_retries, + "retry_policy": retry_policy, + } -def get_cache_directory(): +# NOTE: For backwards compatibility, we keep the `str` +def get_cache_directory() -> str: """Get the current cache directory. 
This gets the cache directory for the current server relative @@ -361,12 +364,11 @@ def get_cache_directory(): """ url_suffix = urlparse(server).netloc - reversed_url_suffix = os.sep.join(url_suffix.split(".")[::-1]) - _cachedir = os.path.join(_root_cache_directory, reversed_url_suffix) - return _cachedir + reversed_url_suffix = os.sep.join(url_suffix.split(".")[::-1]) # noqa: PTH118 + return os.path.join(_root_cache_directory, reversed_url_suffix) # noqa: PTH118 -def set_root_cache_directory(root_cache_directory): +def set_root_cache_directory(root_cache_directory: str | Path) -> None: """Set module-wide base cache directory. Sets the root cache directory, wherin the cache directories are @@ -381,13 +383,12 @@ def set_root_cache_directory(root_cache_directory): root_cache_directory : string Path to use as cache directory. - See also + See Also -------- get_cache_directory """ - - global _root_cache_directory - _root_cache_directory = root_cache_directory + global _root_cache_directory # noqa: PLW0603 + _root_cache_directory = Path(root_cache_directory) start_using_configuration_for_example = ( diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py index efa5a5d5b..480dd9576 100644 --- a/openml/datasets/__init__.py +++ b/openml/datasets/__init__.py @@ -1,20 +1,20 @@ # License: BSD 3-Clause +from .data_feature import OpenMLDataFeature +from .dataset import OpenMLDataset from .functions import ( attributes_arff_from_df, check_datasets_active, create_dataset, + delete_dataset, + edit_dataset, + fork_dataset, get_dataset, get_datasets, list_datasets, - status_update, list_qualities, - edit_dataset, - fork_dataset, - delete_dataset, + status_update, ) -from .dataset import OpenMLDataset -from .data_feature import OpenMLDataFeature __all__ = [ "attributes_arff_from_df", diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py index b4550b5d7..218b0066d 100644 --- a/openml/datasets/data_feature.py +++ b/openml/datasets/data_feature.py 
@@ -1,9 +1,13 @@ # License: BSD 3-Clause +from __future__ import annotations -from typing import List +from typing import TYPE_CHECKING, Any, ClassVar, Sequence +if TYPE_CHECKING: + from IPython.lib import pretty -class OpenMLDataFeature(object): + +class OpenMLDataFeature: """ Data Feature (a.k.a. Attribute) object. @@ -18,52 +22,63 @@ class OpenMLDataFeature(object): nominal_values : list(str) list of the possible values, in case of nominal attribute number_missing_values : int + Number of rows that have a missing value for this feature. + ontologies : list(str) + list of ontologies attached to this feature. An ontology describes the + concept that are described in a feature. An ontology is defined by an + URL where the information is provided. """ - LEGAL_DATA_TYPES = ["nominal", "numeric", "string", "date"] + LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"] - def __init__( + def __init__( # noqa: PLR0913 self, index: int, name: str, data_type: str, - nominal_values: List[str], + nominal_values: list[str], number_missing_values: int, + ontologies: list[str] | None = None, ): - if type(index) != int: - raise ValueError("Index is of wrong datatype") + if not isinstance(index, int): + raise TypeError(f"Index must be `int` but is {type(index)}") + if data_type not in self.LEGAL_DATA_TYPES: raise ValueError( - "data type should be in %s, found: %s" % (str(self.LEGAL_DATA_TYPES), data_type) + f"data type should be in {self.LEGAL_DATA_TYPES!s}, found: {data_type}", ) + if data_type == "nominal": if nominal_values is None: raise TypeError( "Dataset features require attribute `nominal_values` for nominal " - "feature type." 
+ "feature type.", ) - elif not isinstance(nominal_values, list): + + if not isinstance(nominal_values, list): raise TypeError( "Argument `nominal_values` is of wrong datatype, should be list, " - "but is {}".format(type(nominal_values)) + f"but is {type(nominal_values)}", ) - else: - if nominal_values is not None: - raise TypeError("Argument `nominal_values` must be None for non-nominal feature.") - if type(number_missing_values) != int: - raise ValueError("number_missing_values is of wrong datatype") + elif nominal_values is not None: + raise TypeError("Argument `nominal_values` must be None for non-nominal feature.") + + if not isinstance(number_missing_values, int): + msg = f"number_missing_values must be int but is {type(number_missing_values)}" + raise TypeError(msg) self.index = index self.name = str(name) self.data_type = str(data_type) self.nominal_values = nominal_values self.number_missing_values = number_missing_values + self.ontologies = ontologies - def __repr__(self): + def __repr__(self) -> str: return "[%d - %s (%s)]" % (self.index, self.name, self.data_type) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: return isinstance(other, OpenMLDataFeature) and self.__dict__ == other.__dict__ - def _repr_pretty_(self, pp, cycle): + def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None: # noqa: FBT001, ARG002 pp.text(str(self)) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index dcdef162d..0c9da1caf 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -1,13 +1,14 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import OrderedDict -import re import gzip import logging -import os import pickle -from typing import List, Optional, Union, Tuple, Iterable, Dict +import re import warnings +from pathlib import Path +from typing import Any, Iterable, Sequence +from typing_extensions import Literal import arff import numpy as np @@ -16,8 +17,9 @@ import 
xmltodict from openml.base import OpenMLBase +from openml.exceptions import PyOpenMLError + from .data_feature import OpenMLDataFeature -from ..exceptions import PyOpenMLError logger = logging.getLogger(__name__) @@ -88,66 +90,70 @@ class OpenMLDataset(OpenMLBase): MD5 checksum to check if the dataset is downloaded without corruption. data_file : str, optional Path to where the dataset is located. - features : dict, optional + features_file : dict, optional A dictionary of dataset features, which maps a feature index to a OpenMLDataFeature. - qualities : dict, optional + qualities_file : dict, optional A dictionary of dataset qualities, which maps a quality name to a quality value. dataset: string, optional Serialized arff dataset string. - minio_url: string, optional - URL to the MinIO bucket with dataset files + parquet_url: string, optional + This is the URL to the storage location where the dataset files are hosted. + This can be a MinIO bucket URL. If specified, the data will be accessed + from this URL when reading the files. parquet_file: string, optional - Path to the local parquet file. + Path to the local file. 
""" - def __init__( + def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 self, - name, - description, - data_format="arff", - cache_format="pickle", - dataset_id=None, - version=None, - creator=None, - contributor=None, - collection_date=None, - upload_date=None, - language=None, - licence=None, - url=None, - default_target_attribute=None, - row_id_attribute=None, - ignore_attribute=None, - version_label=None, - citation=None, - tag=None, - visibility=None, - original_data_url=None, - paper_url=None, - update_comment=None, - md5_checksum=None, - data_file=None, - features_file: Optional[str] = None, - qualities_file: Optional[str] = None, - dataset=None, - minio_url: Optional[str] = None, - parquet_file: Optional[str] = None, + name: str, + description: str | None, + data_format: Literal["arff", "sparse_arff"] = "arff", + cache_format: Literal["feather", "pickle"] = "pickle", + dataset_id: int | None = None, + version: int | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + upload_date: str | None = None, + language: str | None = None, + licence: str | None = None, + url: str | None = None, + default_target_attribute: str | None = None, + row_id_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, + version_label: str | None = None, + citation: str | None = None, + tag: str | None = None, + visibility: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + update_comment: str | None = None, + md5_checksum: str | None = None, + data_file: str | None = None, + features_file: str | None = None, + qualities_file: str | None = None, + dataset: str | None = None, + parquet_url: str | None = None, + parquet_file: str | None = None, ): - def find_invalid_characters(string, pattern): + if cache_format not in ["feather", "pickle"]: + raise ValueError( + "cache_format must be one of 'feather' or 'pickle. 
" + f"Invalid format specified: {cache_format}", + ) + + def find_invalid_characters(string: str, pattern: str) -> str: invalid_chars = set() regex = re.compile(pattern) for char in string: if not regex.match(char): invalid_chars.add(char) - invalid_chars = ",".join( - [ - "'{}'".format(char) if char != "'" else '"{}"'.format(char) - for char in invalid_chars - ] + return ",".join( + [f"'{char}'" if char != "'" else f'"{char}"' for char in invalid_chars], ) - return invalid_chars if dataset_id is None: pattern = "^[\x00-\x7F]*$" @@ -155,32 +161,35 @@ def find_invalid_characters(string, pattern): # not basiclatin (XSD complains) invalid_characters = find_invalid_characters(description, pattern) raise ValueError( - "Invalid symbols {} in description: {}".format(invalid_characters, description) + f"Invalid symbols {invalid_characters} in description: {description}", ) pattern = "^[\x00-\x7F]*$" if citation and not re.match(pattern, citation): # not basiclatin (XSD complains) invalid_characters = find_invalid_characters(citation, pattern) raise ValueError( - "Invalid symbols {} in citation: {}".format(invalid_characters, citation) + f"Invalid symbols {invalid_characters} in citation: {citation}", ) pattern = "^[a-zA-Z0-9_\\-\\.\\(\\),]+$" if not re.match(pattern, name): # regex given by server in error message invalid_characters = find_invalid_characters(name, pattern) - raise ValueError("Invalid symbols {} in name: {}".format(invalid_characters, name)) + raise ValueError(f"Invalid symbols {invalid_characters} in name: {name}") + + self.ignore_attribute: list[str] | None = None + if isinstance(ignore_attribute, str): + self.ignore_attribute = [ignore_attribute] + elif isinstance(ignore_attribute, list) or ignore_attribute is None: + self.ignore_attribute = ignore_attribute + else: + raise ValueError("Wrong data type for ignore_attribute. 
Should be list.") + # TODO add function to check if the name is casual_string128 # Attributes received by querying the RESTful API self.dataset_id = int(dataset_id) if dataset_id is not None else None self.name = name self.version = int(version) if version is not None else None self.description = description - if cache_format not in ["feather", "pickle"]: - raise ValueError( - "cache_format must be one of 'feather' or 'pickle. " - "Invalid format specified: {}".format(cache_format) - ) - self.cache_format = cache_format # Has to be called format, otherwise there will be an XML upload error self.format = data_format @@ -193,12 +202,7 @@ def find_invalid_characters(string, pattern): self.url = url self.default_target_attribute = default_target_attribute self.row_id_attribute = row_id_attribute - if isinstance(ignore_attribute, str): - self.ignore_attribute = [ignore_attribute] # type: Optional[List[str]] - elif isinstance(ignore_attribute, list) or ignore_attribute is None: - self.ignore_attribute = ignore_attribute - else: - raise ValueError("Wrong data type for ignore_attribute. 
" "Should be list.") + self.version_label = version_label self.citation = citation self.tag = tag @@ -210,14 +214,14 @@ def find_invalid_characters(string, pattern): self.data_file = data_file self.parquet_file = parquet_file self._dataset = dataset - self._minio_url = minio_url + self._parquet_url = parquet_url - self._features = None # type: Optional[Dict[int, OpenMLDataFeature]] - self._qualities = None # type: Optional[Dict[str, float]] + self._features: dict[int, OpenMLDataFeature] | None = None + self._qualities: dict[str, float] | None = None self._no_qualities_found = False if features_file is not None: - self._features = _read_features(features_file) + self._features = _read_features(Path(features_file)) # "" was the old default value by `get_dataset` and maybe still used by some if qualities_file == "": @@ -227,30 +231,40 @@ def find_invalid_characters(string, pattern): "to avoid reading the qualities from file. Set `qualities_file` to None to avoid " "this warning.", FutureWarning, + stacklevel=2, ) + qualities_file = None - if qualities_file: - self._qualities = _read_qualities(qualities_file) + if qualities_file is not None: + self._qualities = _read_qualities(Path(qualities_file)) if data_file is not None: - rval = self._compressed_cache_file_paths(data_file) - self.data_pickle_file = rval[0] if os.path.exists(rval[0]) else None - self.data_feather_file = rval[1] if os.path.exists(rval[1]) else None - self.feather_attribute_file = rval[2] if os.path.exists(rval[2]) else None + data_pickle, data_feather, feather_attribute = self._compressed_cache_file_paths( + Path(data_file) + ) + self.data_pickle_file = data_pickle if Path(data_pickle).exists() else None + self.data_feather_file = data_feather if Path(data_feather).exists() else None + self.feather_attribute_file = feather_attribute if Path(feather_attribute) else None else: self.data_pickle_file = None self.data_feather_file = None self.feather_attribute_file = None @property - def features(self): + 
def features(self) -> dict[int, OpenMLDataFeature]: + """Get the features of this dataset.""" if self._features is None: + # TODO(eddiebergman): These should return a value so we can set it to be not None self._load_features() + assert self._features is not None return self._features @property - def qualities(self): + def qualities(self) -> dict[str, float] | None: + """Get the qualities of this dataset.""" + # TODO(eddiebergman): Better docstring, I don't know what qualities means + # We have to check `_no_qualities_found` as there might not be qualities for a dataset if self._qualities is None and (not self._no_qualities_found): self._load_qualities() @@ -258,26 +272,29 @@ def qualities(self): return self._qualities @property - def id(self) -> Optional[int]: + def id(self) -> int | None: + """Get the dataset numeric id.""" return self.dataset_id - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]: """Collect all information to display in the __repr__ body.""" - # Obtain number of features in accordance with lazy loading. 
+ n_features: int | None = None if self._qualities is not None and self._qualities["NumberOfFeatures"] is not None: - n_features = int(self._qualities["NumberOfFeatures"]) # type: Optional[int] - else: - n_features = len(self._features) if self._features is not None else None + n_features = int(self._qualities["NumberOfFeatures"]) + elif self._features is not None: + n_features = len(self._features) - fields = { + fields: dict[str, int | str | None] = { "Name": self.name, "Version": self.version, "Format": self.format, "Licence": self.licence, "Download URL": self.url, - "Data file": self.data_file, - "Pickle file": self.data_pickle_file, + "Data file": str(self.data_file) if self.data_file is not None else None, + "Pickle file": ( + str(self.data_pickle_file) if self.data_pickle_file is not None else None + ), "# of features": n_features, } if self.upload_date is not None: @@ -303,7 +320,7 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: ] return [(key, fields[key]) for key in order if key in fields] - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if not isinstance(other, OpenMLDataset): return False @@ -328,11 +345,11 @@ def _download_data(self) -> None: # import required here to avoid circular import. from .functions import _get_dataset_arff, _get_dataset_parquet - self.data_file = _get_dataset_arff(self) - if self._minio_url is not None: - self.parquet_file = _get_dataset_parquet(self) + self.data_file = str(_get_dataset_arff(self)) + if self._parquet_url is not None: + self.parquet_file = str(_get_dataset_parquet(self)) - def _get_arff(self, format: str) -> Dict: + def _get_arff(self, format: str) -> dict: # noqa: A002 """Read ARFF file and return decoded arff. Reads the file referenced in self.data_file. @@ -352,44 +369,49 @@ def _get_arff(self, format: str) -> Dict: Decoded arff. """ - # TODO: add a partial read method which only returns the attribute # headers of the corresponding .arff file! 
import struct filename = self.data_file + assert filename is not None + filepath = Path(filename) + bits = 8 * struct.calcsize("P") + # Files can be considered too large on a 32-bit system, # if it exceeds 120mb (slightly more than covtype dataset size) # This number is somewhat arbitrary. - if bits != 64 and os.path.getsize(filename) > 120000000: - raise NotImplementedError( - "File {} too big for {}-bit system ({} bytes).".format( - filename, os.path.getsize(filename), bits + if bits != 64: + MB_120 = 120_000_000 + file_size = filepath.stat().st_size + if file_size > MB_120: + raise NotImplementedError( + f"File {filepath} too big for {bits}-bit system ({file_size} bytes).", ) - ) if format.lower() == "arff": return_type = arff.DENSE elif format.lower() == "sparse_arff": return_type = arff.COO else: - raise ValueError("Unknown data format {}".format(format)) + raise ValueError(f"Unknown data format {format}") - def decode_arff(fh): + def decode_arff(fh: Any) -> dict: decoder = arff.ArffDecoder() - return decoder.decode(fh, encode_nominal=True, return_type=return_type) + return decoder.decode(fh, encode_nominal=True, return_type=return_type) # type: ignore - if filename[-3:] == ".gz": + if filepath.suffix.endswith(".gz"): with gzip.open(filename) as zipfile: return decode_arff(zipfile) else: - with open(filename, encoding="utf8") as fh: + with filepath.open(encoding="utf8") as fh: return decode_arff(fh) - def _parse_data_from_arff( - self, arff_file_path: str - ) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]: + def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915 + self, + arff_file_path: Path, + ) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: """Parse all required data from arff file. 
Parameters @@ -408,8 +430,7 @@ def _parse_data_from_arff( data = self._get_arff(self.format) except OSError as e: logger.critical( - "Please check that the data file {} is " - "there and can be read.".format(arff_file_path) + f"Please check that the data file {arff_file_path} is " "there and can be read.", ) raise e @@ -423,7 +444,7 @@ def _parse_data_from_arff( attribute_names = [] categories_names = {} categorical = [] - for i, (name, type_) in enumerate(data["attributes"]): + for name, type_ in data["attributes"]: # if the feature is nominal and a sparse matrix is # requested, the categories need to be numeric if isinstance(type_, list) and self.format.lower() == "sparse_arff": @@ -431,8 +452,11 @@ def _parse_data_from_arff( # checks if the strings which should be the class labels # can be encoded into integers pd.factorize(type_)[0] - except ValueError: - raise ValueError("Categorical data needs to be numeric when using sparse ARFF.") + except ValueError as e: + raise ValueError( + "Categorical data needs to be numeric when using sparse ARFF." 
+ ) from e + # string can only be supported with pandas DataFrame elif type_ == "STRING" and self.format.lower() == "sparse_arff": raise ValueError("Dataset containing strings is not supported with sparse ARFF.") @@ -443,10 +467,8 @@ def _parse_data_from_arff( categories_names[name] = type_ if len(type_) == 2: type_norm = [cat.lower().capitalize() for cat in type_] - if set(["True", "False"]) == set(type_norm): - categories_names[name] = [ - True if cat == "True" else False for cat in type_norm - ] + if {"True", "False"} == set(type_norm): + categories_names[name] = [cat == "True" for cat in type_norm] attribute_dtype[name] = "boolean" else: attribute_dtype[name] = "categorical" @@ -468,9 +490,11 @@ def _parse_data_from_arff( col = [] for column_name in X.columns: if attribute_dtype[column_name] in ("categorical", "boolean"): - col.append( - self._unpack_categories(X[column_name], categories_names[column_name]) + categories = self._unpack_categories( + X[column_name], # type: ignore + categories_names[column_name], ) + col.append(categories) elif attribute_dtype[column_name] in ("floating", "integer"): X_col = X[column_name] if X_col.min() >= 0 and X_col.max() <= 255: @@ -486,20 +510,20 @@ def _parse_data_from_arff( col.append(X[column_name]) X = pd.concat(col, axis=1) else: - raise ValueError("Dataset format '{}' is not a valid format.".format(self.format)) + raise ValueError(f"Dataset format '{self.format}' is not a valid format.") - return X, categorical, attribute_names + return X, categorical, attribute_names # type: ignore - def _compressed_cache_file_paths(self, data_file: str) -> Tuple[str, str, str]: - ext = f".{data_file.split('.')[-1]}" - data_pickle_file = data_file.replace(ext, ".pkl.py3") - data_feather_file = data_file.replace(ext, ".feather") - feather_attribute_file = data_file.replace(ext, ".feather.attributes.pkl.py3") + def _compressed_cache_file_paths(self, data_file: Path) -> tuple[Path, Path, Path]: + data_pickle_file = 
data_file.with_suffix(".pkl.py3") + data_feather_file = data_file.with_suffix(".feather") + feather_attribute_file = data_file.with_suffix(".feather.attributes.pkl.py3") return data_pickle_file, data_feather_file, feather_attribute_file def _cache_compressed_file_from_file( - self, data_file: str - ) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]: + self, + data_file: Path, + ) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: """Store data from the local file in compressed format. If a local parquet file is present it will be used instead of the arff file. @@ -511,12 +535,12 @@ def _cache_compressed_file_from_file( feather_attribute_file, ) = self._compressed_cache_file_paths(data_file) - if data_file.endswith(".arff"): + if data_file.suffix == ".arff": data, categorical, attribute_names = self._parse_data_from_arff(data_file) - elif data_file.endswith(".pq"): + elif data_file.suffix == ".pq": try: data = pd.read_parquet(data_file) - except Exception as e: + except Exception as e: # noqa: BLE001 raise Exception(f"File: {data_file}") from e categorical = [data[c].dtype.name == "category" for c in data.columns] @@ -530,13 +554,16 @@ def _cache_compressed_file_from_file( logger.info(f"{self.cache_format} write {self.name}") if self.cache_format == "feather": + assert isinstance(data, pd.DataFrame) + data.to_feather(data_feather_file) - with open(feather_attribute_file, "wb") as fh: + with open(feather_attribute_file, "wb") as fh: # noqa: PTH123 pickle.dump((categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL) self.data_feather_file = data_feather_file self.feather_attribute_file = feather_attribute_file + else: - with open(data_pickle_file, "wb") as fh: + with open(data_pickle_file, "wb") as fh: # noqa: PTH123 pickle.dump((data, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL) self.data_pickle_file = data_pickle_file @@ -545,7 +572,7 @@ def _cache_compressed_file_from_file( return data, 
categorical, attribute_names - def _load_data(self): + def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: # noqa: PLR0912, C901 """Load data from compressed format or arff. Download data if not present on disk.""" need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None need_to_create_feather = self.cache_format == "feather" and self.data_feather_file is None @@ -555,24 +582,31 @@ def _load_data(self): self._download_data() file_to_load = self.data_file if self.parquet_file is None else self.parquet_file - return self._cache_compressed_file_from_file(file_to_load) + assert file_to_load is not None + return self._cache_compressed_file_from_file(Path(file_to_load)) # helper variable to help identify where errors occur fpath = self.data_feather_file if self.cache_format == "feather" else self.data_pickle_file logger.info(f"{self.cache_format} load data {self.name}") try: if self.cache_format == "feather": + assert self.data_feather_file is not None + assert self.feather_attribute_file is not None + data = pd.read_feather(self.data_feather_file) fpath = self.feather_attribute_file - with open(self.feather_attribute_file, "rb") as fh: - categorical, attribute_names = pickle.load(fh) + with open(self.feather_attribute_file, "rb") as fh: # noqa: PTH123 + categorical, attribute_names = pickle.load(fh) # noqa: S301 else: - with open(self.data_pickle_file, "rb") as fh: - data, categorical, attribute_names = pickle.load(fh) - except FileNotFoundError: - raise ValueError(f"Cannot find file for dataset {self.name} at location '{fpath}'.") + assert self.data_pickle_file is not None + with open(self.data_pickle_file, "rb") as fh: # noqa: PTH123 + data, categorical, attribute_names = pickle.load(fh) # noqa: S301 + except FileNotFoundError as e: + raise ValueError( + f"Cannot find file for dataset {self.name} at location '{fpath}'." 
+ ) from e except (EOFError, ModuleNotFoundError, ValueError, AttributeError) as e: - error_message = e.message if hasattr(e, "message") else e.args[0] + error_message = getattr(e, "message", e.args[0]) hint = "" if isinstance(e, EOFError): @@ -591,7 +625,7 @@ def _load_data(self): elif isinstance(e, ValueError) and "unsupported pickle protocol" in e.args[0]: readable_error = "Encountered unsupported pickle protocol" else: - raise # an unknown ValueError is raised, should crash and file bug report + raise e logger.warning( f"{readable_error} when loading dataset {self.id} from '{fpath}'. " @@ -600,19 +634,28 @@ def _load_data(self): "We will continue loading data from the arff-file, " "but this will be much slower for big datasets. " "Please manually delete the cache file if you want OpenML-Python " - "to attempt to reconstruct it." + "to attempt to reconstruct it.", ) - data, categorical, attribute_names = self._parse_data_from_arff(self.data_file) + assert self.data_file is not None + data, categorical, attribute_names = self._parse_data_from_arff(Path(self.data_file)) data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data) if self.cache_format == "pickle" and not data_up_to_date: logger.info("Updating outdated pickle file.") file_to_load = self.data_file if self.parquet_file is None else self.parquet_file - return self._cache_compressed_file_from_file(file_to_load) + assert file_to_load is not None + + return self._cache_compressed_file_from_file(Path(file_to_load)) return data, categorical, attribute_names + # TODO(eddiebergman): Can type this better with overload + # TODO(eddiebergman): Could also techinically use scipy.sparse.sparray @staticmethod - def _convert_array_format(data, array_format, attribute_names): + def _convert_array_format( + data: pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix, + array_format: Literal["array", "dataframe"], + attribute_names: list | None = None, + ) -> pd.DataFrame | pd.Series | 
np.ndarray | scipy.sparse.spmatrix: """Convert a dataset to a given array format. Converts to numpy array if data is non-sparse. @@ -635,18 +678,17 @@ def _convert_array_format(data, array_format, attribute_names): else returns data as is """ - - if array_format == "array" and not scipy.sparse.issparse(data): + if array_format == "array" and not isinstance(data, scipy.sparse.spmatrix): # We encode the categories such that they are integer to be able # to make a conversion to numeric for backward compatibility - def _encode_if_category(column): + def _encode_if_category(column: pd.Series | np.ndarray) -> pd.Series | np.ndarray: if column.dtype.name == "category": column = column.cat.codes.astype(np.float32) mask_nan = column == -1 column[mask_nan] = np.nan return column - if data.ndim == 2: + if isinstance(data, pd.DataFrame): columns = { column_name: _encode_if_category(data.loc[:, column_name]) for column_name in data.columns @@ -654,28 +696,33 @@ def _encode_if_category(column): data = pd.DataFrame(columns) else: data = _encode_if_category(data) + try: - return np.asarray(data, dtype=np.float32) - except ValueError: + # TODO(eddiebergman): float32? + return_array = np.asarray(data, dtype=np.float32) + except ValueError as e: raise PyOpenMLError( "PyOpenML cannot handle string when returning numpy" - ' arrays. Use dataset_format="dataframe".' - ) - elif array_format == "dataframe": + ' arrays. Use dataset_format="dataframe".', + ) from e + + return return_array + + if array_format == "dataframe": if scipy.sparse.issparse(data): data = pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names) else: data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data" logger.warning( - "Cannot convert %s (%s) to '%s'. Returning input data." - % (data_type, type(data), array_format) + f"Cannot convert {data_type} ({type(data)}) to '{array_format}'." 
+ " Returning input data.", ) return data @staticmethod - def _unpack_categories(series, categories): + def _unpack_categories(series: pd.Series, categories: list) -> pd.Series: # nan-likes can not be explicitly specified as a category - def valid_category(cat): + def valid_category(cat: Any) -> bool: return isinstance(cat, str) or (cat is not None and not np.isnan(cat)) filtered_categories = [c for c in categories if valid_category(c)] @@ -685,22 +732,23 @@ def valid_category(cat): col.append(categories[int(x)]) except (TypeError, ValueError): col.append(np.nan) + # We require two lines to create a series of categories as detailed here: - # https://pandas.pydata.org/pandas-docs/version/0.24/user_guide/categorical.html#series-creation # noqa E501 + # https://pandas.pydata.org/pandas-docs/version/0.24/user_guide/categorical.html#series-creation raw_cat = pd.Categorical(col, ordered=True, categories=filtered_categories) return pd.Series(raw_cat, index=series.index, name=series.name) - def get_data( + def get_data( # noqa: C901, PLR0912, PLR0915 self, - target: Optional[Union[List[str], str]] = None, - include_row_id: bool = False, - include_ignore_attribute: bool = False, - dataset_format: str = "dataframe", - ) -> Tuple[ - Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix], - Optional[Union[np.ndarray, pd.DataFrame]], - List[bool], - List[str], + target: list[str] | str | None = None, + include_row_id: bool = False, # noqa: FBT001, FBT002 + include_ignore_attribute: bool = False, # noqa: FBT001, FBT002 + dataset_format: Literal["array", "dataframe"] = "dataframe", + ) -> tuple[ + np.ndarray | pd.DataFrame | scipy.sparse.csr_matrix, + np.ndarray | pd.DataFrame | None, + list[bool], + list[str], ]: """Returns dataset content as dataframes or sparse matrices. 
@@ -759,47 +807,40 @@ def get_data( if len(to_exclude) > 0: logger.info("Going to remove the following attributes: %s" % to_exclude) - keep = np.array( - [True if column not in to_exclude else False for column in attribute_names] - ) - if hasattr(data, "iloc"): - data = data.iloc[:, keep] - else: - data = data[:, keep] + keep = np.array([column not in to_exclude for column in attribute_names]) + data = data.loc[:, keep] if isinstance(data, pd.DataFrame) else data[:, keep] + categorical = [cat for cat, k in zip(categorical, keep) if k] attribute_names = [att for att, k in zip(attribute_names, keep) if k] if target is None: - data = self._convert_array_format(data, dataset_format, attribute_names) + data = self._convert_array_format(data, dataset_format, attribute_names) # type: ignore targets = None else: if isinstance(target, str): - if "," in target: - target = target.split(",") - else: - target = [target] - targets = np.array([True if column in target else False for column in attribute_names]) - target_names = np.array([column for column in attribute_names if column in target]) + target = target.split(",") if "," in target else [target] + targets = np.array([column in target for column in attribute_names]) + target_names = [column for column in attribute_names if column in target] if np.sum(targets) > 1: raise NotImplementedError( - "Number of requested targets %d is not implemented." % np.sum(targets) + "Number of requested targets %d is not implemented." 
% np.sum(targets), ) target_categorical = [ cat for cat, column in zip(categorical, attribute_names) if column in target ] target_dtype = int if target_categorical[0] else float - if hasattr(data, "iloc"): + if isinstance(data, pd.DataFrame): x = data.iloc[:, ~targets] y = data.iloc[:, targets] else: x = data[:, ~targets] - y = data[:, targets].astype(target_dtype) + y = data[:, targets].astype(target_dtype) # type: ignore categorical = [cat for cat, t in zip(categorical, targets) if not t] attribute_names = [att for att, k in zip(attribute_names, targets) if not k] - x = self._convert_array_format(x, dataset_format, attribute_names) + x = self._convert_array_format(x, dataset_format, attribute_names) # type: ignore if dataset_format == "array" and scipy.sparse.issparse(y): # scikit-learn requires dense representation of targets y = np.asarray(y.todense()).astype(target_dtype) @@ -807,15 +848,16 @@ def get_data( # need to flatten it to a 1-d array for _convert_array_format() y = y.squeeze() y = self._convert_array_format(y, dataset_format, target_names) - y = y.astype(target_dtype) if dataset_format == "array" else y + y = y.astype(target_dtype) if isinstance(y, np.ndarray) else y if len(y.shape) > 1 and y.shape[1] == 1: # single column targets should be 1-d for both `array` and `dataframe` formats + assert isinstance(y, (np.ndarray, pd.DataFrame, pd.Series)) y = y.squeeze() data, targets = x, y - return data, targets, categorical, attribute_names + return data, targets, categorical, attribute_names # type: ignore - def _load_features(self): + def _load_features(self) -> None: """Load the features metadata from the server and store it in the dataset object.""" # Delayed Import to avoid circular imports or having to import all of dataset.functions to # import OpenMLDataset. @@ -824,13 +866,13 @@ def _load_features(self): if self.dataset_id is None: raise ValueError( "No dataset id specified. Please set the dataset id. Otherwise we cannot load " - "metadata." 
+ "metadata.", ) features_file = _get_dataset_features_file(None, self.dataset_id) self._features = _read_features(features_file) - def _load_qualities(self): + def _load_qualities(self) -> None: """Load qualities information from the server and store it in the dataset object.""" # same reason as above for _load_features from openml.datasets.functions import _get_dataset_qualities_file @@ -838,7 +880,7 @@ def _load_qualities(self): if self.dataset_id is None: raise ValueError( "No dataset id specified. Please set the dataset id. Otherwise we cannot load " - "metadata." + "metadata.", ) qualities_file = _get_dataset_qualities_file(None, self.dataset_id) @@ -848,7 +890,7 @@ def _load_qualities(self): else: self._qualities = _read_qualities(qualities_file) - def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[str]]: + def retrieve_class_labels(self, target_name: str = "class") -> None | list[str]: """Reads the datasets arff to determine the class-labels. If the task has no class labels (for example a regression problem) @@ -866,13 +908,27 @@ def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[ list """ for feature in self.features.values(): - if (feature.name == target_name) and (feature.data_type == "nominal"): - return feature.nominal_values + if feature.name == target_name: + if feature.data_type == "nominal": + return feature.nominal_values + + if feature.data_type == "string": + # Rel.: #1311 + # The target is invalid for a classification task if the feature type is string + # and not nominal. For such miss-configured tasks, we silently fix it here as + # we can safely interpreter string as nominal. 
+ df, *_ = self.get_data() + return list(df[feature.name].unique()) + return None - def get_features_by_type( - self, data_type, exclude=None, exclude_ignore_attribute=True, exclude_row_id_attribute=True - ): + def get_features_by_type( # noqa: C901 + self, + data_type: str, + exclude: list[str] | None = None, + exclude_ignore_attribute: bool = True, # noqa: FBT002, FBT001 + exclude_row_id_attribute: bool = True, # noqa: FBT002, FBT001 + ) -> list[int]: """ Return indices of features of a given type, e.g. all nominal features. Optional parameters to exclude various features by index or ontology. @@ -882,8 +938,7 @@ def get_features_by_type( data_type : str The data type to return (e.g., nominal, numeric, date, string) exclude : list(int) - Indices to exclude (and adapt the return values as if these indices - are not present) + List of columns to exclude from the return value exclude_ignore_attribute : bool Whether to exclude the defined ignore attributes (and adapt the return values as if these indices are not present) @@ -898,15 +953,12 @@ def get_features_by_type( """ if data_type not in OpenMLDataFeature.LEGAL_DATA_TYPES: raise TypeError("Illegal feature type requested") - if self.ignore_attribute is not None: - if not isinstance(self.ignore_attribute, list): - raise TypeError("ignore_attribute should be a list") - if self.row_id_attribute is not None: - if not isinstance(self.row_id_attribute, str): - raise TypeError("row id attribute should be a str") - if exclude is not None: - if not isinstance(exclude, list): - raise TypeError("Exclude should be a list") + if self.ignore_attribute is not None and not isinstance(self.ignore_attribute, list): + raise TypeError("ignore_attribute should be a list") + if self.row_id_attribute is not None and not isinstance(self.row_id_attribute, str): + raise TypeError("row id attribute should be a str") + if exclude is not None and not isinstance(exclude, list): + raise TypeError("Exclude should be a list") # assert 
all(isinstance(elem, str) for elem in exclude), # "Exclude should be a list of strings" to_exclude = [] @@ -925,35 +977,36 @@ def get_features_by_type( name = self.features[idx].name if name in to_exclude: offset += 1 - else: - if self.features[idx].data_type == data_type: - result.append(idx - offset) + elif self.features[idx].data_type == data_type: + result.append(idx - offset) return result - def _get_file_elements(self) -> Dict: + def _get_file_elements(self) -> dict: """Adds the 'dataset' to file elements.""" - file_elements = {} - path = None if self.data_file is None else os.path.abspath(self.data_file) + file_elements: dict = {} + path = None if self.data_file is None else Path(self.data_file).absolute() if self._dataset is not None: file_elements["dataset"] = self._dataset - elif path is not None and os.path.exists(path): - with open(path, "rb") as fp: + elif path is not None and path.exists(): + with path.open("rb") as fp: file_elements["dataset"] = fp.read() + try: - dataset_utf8 = str(file_elements["dataset"], "utf8") + dataset_utf8 = str(file_elements["dataset"], encoding="utf8") arff.ArffDecoder().decode(dataset_utf8, encode_nominal=True) - except arff.ArffException: - raise ValueError("The file you have provided is not a valid arff file.") + except arff.ArffException as e: + raise ValueError("The file you have provided is not a valid arff file.") from e + elif self.url is None: raise ValueError("No valid url/path to the data file was given.") return file_elements - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.dataset_id = int(xml_response["oml:upload_data_set"]["oml:id"]) - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": + def _to_dict(self) -> dict[str, dict]: """Creates a dictionary representation of self.""" props = [ "id", @@ -981,39 +1034,43 @@ def _to_dict(self) -> "OrderedDict[str, 
OrderedDict]": "md5_checksum", ] - data_container = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' - data_dict = OrderedDict([("@xmlns:oml", "http://openml.org/openml")]) - data_container["oml:data_set_description"] = data_dict - + prop_values = {} for prop in props: content = getattr(self, prop, None) if content is not None: - data_dict["oml:" + prop] = content + prop_values["oml:" + prop] = content - return data_container + return { + "oml:data_set_description": { + "@xmlns:oml": "http://openml.org/openml", + **prop_values, + } + } -def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]: - features_pickle_file = _get_features_pickle_file(features_file) +def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]: + features_pickle_file = Path(_get_features_pickle_file(str(features_file))) try: - with open(features_pickle_file, "rb") as fh_binary: - features = pickle.load(fh_binary) - except: # noqa E722 - with open(features_file, encoding="utf8") as fh: + with features_pickle_file.open("rb") as fh_binary: + return pickle.load(fh_binary) # type: ignore # noqa: S301 + + except: # noqa: E722 + with Path(features_file).open("r", encoding="utf8") as fh: features_xml_string = fh.read() features = _parse_features_xml(features_xml_string) - with open(features_pickle_file, "wb") as fh_binary: + with features_pickle_file.open("wb") as fh_binary: pickle.dump(features, fh_binary) - return features + + return features -def _parse_features_xml(features_xml_string): +def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]: xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value")) features_xml = xml_dict["oml:data_features"] - features = {} + features: dict[int, OpenMLDataFeature] = {} for idx, xmlfeature in enumerate(features_xml["oml:feature"]): nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) feature = OpenMLDataFeature( @@ -1022,6 +1079,7 @@ def 
_parse_features_xml(features_xml_string): xmlfeature["oml:data_type"], xmlfeature.get("oml:nominal_value"), int(nr_missing), + xmlfeature.get("oml:ontology"), ) if idx != feature.index: raise ValueError("Data features not provided in right order") @@ -1030,32 +1088,40 @@ def _parse_features_xml(features_xml_string): return features +# TODO(eddiebergman): Should this really exist? def _get_features_pickle_file(features_file: str) -> str: - """This function only exists so it can be mocked during unit testing""" + """Exists so it can be mocked during unit testing""" return features_file + ".pkl" -def _read_qualities(qualities_file: str) -> Dict[str, float]: - qualities_pickle_file = _get_qualities_pickle_file(qualities_file) +# TODO(eddiebergman): Should this really exist? +def _get_qualities_pickle_file(qualities_file: str) -> str: + """Exists so it can be mocked during unit testing.""" + return qualities_file + ".pkl" + + +def _read_qualities(qualities_file: str | Path) -> dict[str, float]: + qualities_file = Path(qualities_file) + qualities_pickle_file = Path(_get_qualities_pickle_file(str(qualities_file))) try: - with open(qualities_pickle_file, "rb") as fh_binary: - qualities = pickle.load(fh_binary) - except: # noqa E722 - with open(qualities_file, encoding="utf8") as fh: + with qualities_pickle_file.open("rb") as fh_binary: + return pickle.load(fh_binary) # type: ignore # noqa: S301 + except: # noqa: E722 + with qualities_file.open(encoding="utf8") as fh: qualities_xml = fh.read() + qualities = _parse_qualities_xml(qualities_xml) - with open(qualities_pickle_file, "wb") as fh_binary: + with qualities_pickle_file.open("wb") as fh_binary: pickle.dump(qualities, fh_binary) - return qualities + return qualities -def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]: + +def _check_qualities(qualities: list[dict[str, str]]) -> dict[str, float]: qualities_ = {} for xmlquality in qualities: name = xmlquality["oml:name"] - if 
xmlquality.get("oml:value", None) is None: - value = float("NaN") - elif xmlquality["oml:value"] == "null": + if xmlquality.get("oml:value", None) is None or xmlquality["oml:value"] == "null": value = float("NaN") else: value = float(xmlquality["oml:value"]) @@ -1063,12 +1129,7 @@ def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]: return qualities_ -def _parse_qualities_xml(qualities_xml): +def _parse_qualities_xml(qualities_xml: str) -> dict[str, float]: xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] return _check_qualities(qualities) - - -def _get_qualities_pickle_file(qualities_file: str) -> str: - """This function only exists so it can be mocked during unit testing""" - return qualities_file + ".pkl" diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index adbb46c6e..a797588d4 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -1,47 +1,58 @@ # License: BSD 3-Clause +# ruff: noqa: PLR0913 +from __future__ import annotations -import io import logging -import os -from pyexpat import ExpatError -from typing import List, Dict, Optional, Union, cast import warnings +from collections import OrderedDict +from pathlib import Path +from typing import TYPE_CHECKING, Any, overload +from typing_extensions import Literal +import arff import minio.error import numpy as np -import arff import pandas as pd import urllib3 - import xmltodict +from pyexpat import ExpatError from scipy.sparse import coo_matrix -from collections import OrderedDict -import openml.utils import openml._api_calls -from .dataset import OpenMLDataset -from ..exceptions import ( +import openml.utils +from openml.exceptions import ( OpenMLHashException, + OpenMLPrivateDatasetError, OpenMLServerError, OpenMLServerException, - OpenMLPrivateDatasetError, ) -from ..utils import _remove_cache_dir_for_id, _create_cache_directory_for_id, 
_get_cache_dir_for_id +from openml.utils import ( + _create_cache_directory_for_id, + _get_cache_dir_for_id, + _remove_cache_dir_for_id, +) + +from .dataset import OpenMLDataset + +if TYPE_CHECKING: + import scipy DATASETS_CACHE_DIR_NAME = "datasets" logger = logging.getLogger(__name__) +NO_ACCESS_GRANTED_ERRCODE = 112 ############################################################################ # Local getters/accessors to the cache directory -def _get_cache_directory(dataset: OpenMLDataset) -> str: - """Return the cache directory of the OpenMLDataset""" +def _get_cache_directory(dataset: OpenMLDataset) -> Path: + """Creates and returns the cache directory of the OpenMLDataset.""" + assert dataset.dataset_id is not None return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id) -def list_qualities() -> List[str]: +def list_qualities() -> list[str]: """Return list of data qualities available. The function performs an API call to retrieve the entire list of @@ -56,22 +67,63 @@ def list_qualities() -> List[str]: qualities = xmltodict.parse(xml_string, force_list=("oml:quality")) # Minimalistic check if the XML is useful if "oml:data_qualities_list" not in qualities: - raise ValueError("Error in return XML, does not contain " '"oml:data_qualities_list"') + raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') + if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): raise TypeError("Error in return XML, does not contain " '"oml:quality" as a list') - qualities = qualities["oml:data_qualities_list"]["oml:quality"] - return qualities + + return qualities["oml:data_qualities_list"]["oml:quality"] +@overload def list_datasets( - data_id: Optional[List[int]] = None, - offset: Optional[int] = None, - size: Optional[int] = None, - status: Optional[str] = None, - tag: Optional[str] = None, - output_format: str = "dict", - **kwargs, -) -> Union[Dict, pd.DataFrame]: + data_id: list[int] | None = ..., + 
offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + tag: str | None = ..., + *, + output_format: Literal["dataframe"], + **kwargs: Any, +) -> pd.DataFrame: + ... + + +@overload +def list_datasets( + data_id: list[int] | None, + offset: int | None, + size: int | None, + status: str | None, + tag: str | None, + output_format: Literal["dataframe"], + **kwargs: Any, +) -> pd.DataFrame: + ... + + +@overload +def list_datasets( + data_id: list[int] | None = ..., + offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + tag: str | None = ..., + output_format: Literal["dict"] = "dict", + **kwargs: Any, +) -> pd.DataFrame: + ... + + +def list_datasets( + data_id: list[int] | None = None, + offset: int | None = None, + size: int | None = None, + status: str | None = None, + tag: str | None = None, + output_format: Literal["dataframe", "dict"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Return a list of all dataset which are on OpenML. Supports large amount of results. @@ -126,7 +178,7 @@ def list_datasets( """ if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", ) # TODO: [0.15] @@ -138,9 +190,9 @@ def list_datasets( ) warnings.warn(msg, category=FutureWarning, stacklevel=2) - return openml.utils._list_all( + return openml.utils._list_all( # type: ignore data_id=data_id, - output_format=output_format, + list_output_format=output_format, # type: ignore listing_call=_list_datasets, offset=offset, size=size, @@ -150,7 +202,29 @@ def list_datasets( ) -def _list_datasets(data_id: Optional[List] = None, output_format="dict", **kwargs): +@overload +def _list_datasets( + data_id: list | None = ..., + output_format: Literal["dict"] = "dict", + **kwargs: Any, +) -> dict: + ... 
+ + +@overload +def _list_datasets( + data_id: list | None = ..., + output_format: Literal["dataframe"] = "dataframe", + **kwargs: Any, +) -> pd.DataFrame: + ... + + +def _list_datasets( + data_id: list | None = None, + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Perform api call to return a list of all datasets. @@ -176,28 +250,42 @@ def _list_datasets(data_id: Optional[List] = None, output_format="dict", **kwarg ------- datasets : dict of dicts, or dataframe """ - api_call = "data/list" if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" if data_id is not None: api_call += "/data_id/%s" % ",".join([str(int(i)) for i in data_id]) return __list_datasets(api_call=api_call, output_format=output_format) -def __list_datasets(api_call, output_format="dict"): +@overload +def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: + ... + + +@overload +def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: + ... 
+ + +def __list_datasets( + api_call: str, + output_format: Literal["dict", "dataframe"] = "dict", +) -> dict | pd.DataFrame: xml_string = openml._api_calls._perform_api_call(api_call, "get") datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) # Minimalistic check if the XML is useful - assert type(datasets_dict["oml:data"]["oml:dataset"]) == list, type(datasets_dict["oml:data"]) + assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( + datasets_dict["oml:data"], + ) assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ "oml:data" ]["@xmlns:oml"] - datasets = dict() + datasets = {} for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: ignore_attribute = ["oml:file_id", "oml:quality"] dataset = { @@ -207,7 +295,7 @@ def __list_datasets(api_call, output_format="dict"): dataset["version"] = int(dataset["version"]) # The number of qualities can range from 0 to infinity - for quality in dataset_.get("oml:quality", list()): + for quality in dataset_.get("oml:quality", []): try: dataset[quality["@name"]] = int(quality["#text"]) except ValueError: @@ -220,7 +308,7 @@ def __list_datasets(api_call, output_format="dict"): return datasets -def _expand_parameter(parameter: Union[str, List[str]]) -> List[str]: +def _expand_parameter(parameter: str | list[str] | None) -> list[str]: expanded_parameter = [] if isinstance(parameter, str): expanded_parameter = [x.strip() for x in parameter.split(",")] @@ -230,23 +318,24 @@ def _expand_parameter(parameter: Union[str, List[str]]) -> List[str]: def _validated_data_attributes( - attributes: List[str], data_attributes: List[str], parameter_name: str + attributes: list[str], + data_attributes: list[tuple[str, Any]], + parameter_name: str, ) -> None: for attribute_ in attributes: - is_attribute_a_data_attribute = any([attr[0] == attribute_ for attr in data_attributes]) + is_attribute_a_data_attribute = any(dattr[0] == attribute_ for dattr in data_attributes) 
if not is_attribute_a_data_attribute: raise ValueError( - "all attribute of '{}' should be one of the data attribute. " - " Got '{}' while candidates are {}.".format( - parameter_name, attribute_, [attr[0] for attr in data_attributes] - ) + f"all attribute of '{parameter_name}' should be one of the data attribute. " + f" Got '{attribute_}' while candidates are" + f" {[dattr[0] for dattr in data_attributes]}.", ) def check_datasets_active( - dataset_ids: List[int], - raise_error_if_not_exist: bool = True, -) -> Dict[int, bool]: + dataset_ids: list[int], + raise_error_if_not_exist: bool = True, # noqa: FBT001, FBT002 +) -> dict[int, bool]: """ Check if the dataset ids provided are active. @@ -276,7 +365,9 @@ def check_datasets_active( def _name_to_id( - dataset_name: str, version: Optional[int] = None, error_if_multiple: bool = False + dataset_name: str, + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT001, FBT002 ) -> int: """Attempt to find the dataset id of the dataset with the given name. @@ -304,27 +395,30 @@ def _name_to_id( The id of the dataset. """ status = None if version is not None else "active" - candidates = cast( - pd.DataFrame, - list_datasets( - data_name=dataset_name, status=status, data_version=version, output_format="dataframe" - ), + candidates = list_datasets( + data_name=dataset_name, + status=status, + data_version=version, + output_format="dataframe", ) if error_if_multiple and len(candidates) > 1: msg = f"Multiple active datasets exist with name '{dataset_name}'." raise ValueError(msg) + if candidates.empty: no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'" and_version = f" and version '{version}'." if version is not None else "." 
raise RuntimeError(no_dataset_for_name + and_version) # Dataset ids are chronological so we can just sort based on ids (instead of version) - return candidates["did"].min() + return candidates["did"].min() # type: ignore def get_datasets( - dataset_ids: List[Union[str, int]], download_data: bool = True, download_qualities: bool = True -) -> List[OpenMLDataset]: + dataset_ids: list[str | int], + download_data: bool = True, # noqa: FBT001, FBT002 + download_qualities: bool = True, # noqa: FBT001, FBT002 +) -> list[OpenMLDataset]: """Download datasets. This function iterates :meth:`openml.datasets.get_dataset`. @@ -350,22 +444,22 @@ def get_datasets( datasets = [] for dataset_id in dataset_ids: datasets.append( - get_dataset(dataset_id, download_data, download_qualities=download_qualities) + get_dataset(dataset_id, download_data, download_qualities=download_qualities), ) return datasets @openml.utils.thread_safe_if_oslo_installed -def get_dataset( - dataset_id: Union[int, str], - download_data: Optional[bool] = None, # Optional for deprecation warning; later again only bool - version: Optional[int] = None, - error_if_multiple: bool = False, - cache_format: str = "pickle", - download_qualities: Optional[bool] = None, # Same as above - download_features_meta_data: Optional[bool] = None, # Same as above - download_all_files: bool = False, - force_refresh_cache: bool = False, +def get_dataset( # noqa: C901, PLR0912 + dataset_id: int | str, + download_data: bool | None = None, # Optional for deprecation warning; later again only bool + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT002, FBT001 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool | None = None, # Same as above + download_features_meta_data: bool | None = None, # Same as above + download_all_files: bool = False, # noqa: FBT002, FBT001 + force_refresh_cache: bool = False, # noqa: FBT001, FBT002 ) -> OpenMLDataset: """Download the OpenML dataset 
representation, optionally also download actual data file. @@ -442,6 +536,7 @@ def get_dataset( "`download_qualities`, and `download_features_meta_data` to a bool while calling " "`get_dataset`.", FutureWarning, + stacklevel=2, ) download_data = True if download_data is None else download_data @@ -452,13 +547,15 @@ def get_dataset( if download_all_files: warnings.warn( - "``download_all_files`` is experimental and is likely to break with new releases." + "``download_all_files`` is experimental and is likely to break with new releases.", + FutureWarning, + stacklevel=2, ) if cache_format not in ["feather", "pickle"]: raise ValueError( "cache_format must be one of 'feather' or 'pickle. " - "Invalid format specified: {}".format(cache_format) + f"Invalid format specified: {cache_format}", ) if isinstance(dataset_id, str): @@ -468,12 +565,12 @@ def get_dataset( dataset_id = _name_to_id(dataset_id, version, error_if_multiple) # type: ignore elif not isinstance(dataset_id, int): raise TypeError( - "`dataset_id` must be one of `str` or `int`, not {}.".format(type(dataset_id)) + f"`dataset_id` must be one of `str` or `int`, not {type(dataset_id)}.", ) if force_refresh_cache: did_cache_dir = _get_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - if os.path.exists(did_cache_dir): + if did_cache_dir.exists(): _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) did_cache_dir = _create_cache_directory_for_id( @@ -493,10 +590,11 @@ def get_dataset( qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) arff_file = _get_dataset_arff(description) if download_data else None - if "oml:minio_url" in description and download_data: + if "oml:parquet_url" in description and download_data: try: parquet_file = _get_dataset_parquet( - description, download_all_files=download_all_files + description, + download_all_files=download_all_files, ) except urllib3.exceptions.MaxRetryError: parquet_file = None @@ -508,21 +606,25 @@ def get_dataset( except 
OpenMLServerException as e: # if there was an exception # check if the user had access to the dataset - if e.code == 112: + if e.code == NO_ACCESS_GRANTED_ERRCODE: raise OpenMLPrivateDatasetError(e.message) from None - else: - raise e + + raise e finally: if remove_dataset_cache: _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) - dataset = _create_dataset_from_description( - description, features_file, qualities_file, arff_file, parquet_file, cache_format + return _create_dataset_from_description( + description, + features_file, + qualities_file, + arff_file, + parquet_file, + cache_format, ) - return dataset -def attributes_arff_from_df(df): +def attributes_arff_from_df(df: pd.DataFrame) -> list[tuple[str, list[str] | str]]: """Describe attributes of the dataframe according to ARFF specification. Parameters @@ -532,13 +634,13 @@ def attributes_arff_from_df(df): Returns ------- - attributes_arff : str + attributes_arff : list[str] The data set attributes as required by the ARFF format. """ PD_DTYPES_TO_ARFF_DTYPE = {"integer": "INTEGER", "floating": "REAL", "string": "STRING"} - attributes_arff = [] + attributes_arff: list[tuple[str, list[str] | str]] = [] - if not all([isinstance(column_name, str) for column_name in df.columns]): + if not all(isinstance(column_name, str) for column_name in df.columns): logger.warning("Converting non-str column names to str.") df.columns = [str(column_name) for column_name in df.columns] @@ -555,47 +657,50 @@ def attributes_arff_from_df(df): categories_dtype = pd.api.types.infer_dtype(categories) if categories_dtype not in ("string", "unicode"): raise ValueError( - "The column '{}' of the dataframe is of " + f"The column '{column_name}' of the dataframe is of " "'category' dtype. Therefore, all values in " "this columns should be string. Please " "convert the entries which are not string. 
" - "Got {} dtype in this column.".format(column_name, categories_dtype) + f"Got {categories_dtype} dtype in this column.", ) attributes_arff.append((column_name, categories.tolist())) elif column_dtype == "boolean": # boolean are encoded as categorical. attributes_arff.append((column_name, ["True", "False"])) - elif column_dtype in PD_DTYPES_TO_ARFF_DTYPE.keys(): + elif column_dtype in PD_DTYPES_TO_ARFF_DTYPE: attributes_arff.append((column_name, PD_DTYPES_TO_ARFF_DTYPE[column_dtype])) else: raise ValueError( - "The dtype '{}' of the column '{}' is not " + f"The dtype '{column_dtype}' of the column '{column_name}' is not " "currently supported by liac-arff. Supported " "dtypes are categorical, string, integer, " - "floating, and boolean.".format(column_dtype, column_name) + "floating, and boolean.", ) return attributes_arff -def create_dataset( - name, - description, - creator, - contributor, - collection_date, - language, - licence, - attributes, - data, - default_target_attribute, - ignore_attribute, - citation, - row_id_attribute=None, - original_data_url=None, - paper_url=None, - update_comment=None, - version_label=None, -): +def create_dataset( # noqa: C901, PLR0912, PLR0915 + name: str, + description: str | None, + creator: str | None, + contributor: str | None, + collection_date: str | None, + language: str | None, + licence: str | None, + # TODO(eddiebergman): Docstring says `type` but I don't know what this is other than strings + # Edit: Found it could also be like ["True", "False"] + attributes: list[tuple[str, str | list[str]]] | dict[str, str | list[str]] | Literal["auto"], + data: pd.DataFrame | np.ndarray | scipy.sparse.coo_matrix, + # TODO(eddiebergman): Function requires `default_target_attribute` exist but API allows None + default_target_attribute: str, + ignore_attribute: str | list[str] | None, + citation: str, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + update_comment: str 
| None = None, + version_label: str | None = None, +) -> OpenMLDataset: """Create a dataset. This function creates an OpenMLDataset object. @@ -661,8 +766,8 @@ def create_dataset( Returns ------- class:`openml.OpenMLDataset` - Dataset description.""" - + Dataset description. + """ if isinstance(data, pd.DataFrame): # infer the row id from the index of the dataset if row_id_attribute is None: @@ -673,10 +778,10 @@ def create_dataset( data = data.reset_index() if attributes == "auto" or isinstance(attributes, dict): - if not hasattr(data, "columns"): + if not isinstance(data, pd.DataFrame): raise ValueError( "Automatically inferring attributes requires " - "a pandas DataFrame. A {!r} was given instead.".format(data) + f"a pandas DataFrame. A {data!r} was given instead.", ) # infer the type of data for each column of the DataFrame attributes_ = attributes_arff_from_df(data) @@ -684,7 +789,7 @@ def create_dataset( # override the attributes which was specified by the user for attr_idx in range(len(attributes_)): attr_name = attributes_[attr_idx][0] - if attr_name in attributes.keys(): + if attr_name in attributes: attributes_[attr_idx] = (attr_name, attributes[attr_name]) else: attributes_ = attributes @@ -695,26 +800,28 @@ def create_dataset( _validated_data_attributes(default_target_attributes, attributes_, "default_target_attribute") if row_id_attribute is not None: - is_row_id_an_attribute = any([attr[0] == row_id_attribute for attr in attributes_]) + is_row_id_an_attribute = any(attr[0] == row_id_attribute for attr in attributes_) if not is_row_id_an_attribute: raise ValueError( "'row_id_attribute' should be one of the data attribute. 
" " Got '{}' while candidates are {}.".format( - row_id_attribute, [attr[0] for attr in attributes_] - ) + row_id_attribute, + [attr[0] for attr in attributes_], + ), ) - if hasattr(data, "columns"): + if isinstance(data, pd.DataFrame): if all(isinstance(dtype, pd.SparseDtype) for dtype in data.dtypes): data = data.sparse.to_coo() # liac-arff only support COO matrices with sorted rows - row_idx_sorted = np.argsort(data.row) - data.row = data.row[row_idx_sorted] - data.col = data.col[row_idx_sorted] - data.data = data.data[row_idx_sorted] + row_idx_sorted = np.argsort(data.row) # type: ignore + data.row = data.row[row_idx_sorted] # type: ignore + data.col = data.col[row_idx_sorted] # type: ignore + data.data = data.data[row_idx_sorted] # type: ignore else: - data = data.values + data = data.to_numpy() + data_format: Literal["arff", "sparse_arff"] if isinstance(data, (list, np.ndarray)): if isinstance(data[0], (list, np.ndarray)): data_format = "arff" @@ -725,7 +832,7 @@ def create_dataset( "When giving a list or a numpy.ndarray, " "they should contain a list/ numpy.ndarray " "for dense data or a dictionary for sparse " - "data. Got {!r} instead.".format(data[0]) + f"data. Got {data[0]!r} instead.", ) elif isinstance(data, coo_matrix): data_format = "sparse_arff" @@ -734,7 +841,7 @@ def create_dataset( "When giving a list or a numpy.ndarray, " "they should contain a list/ numpy.ndarray " "for dense data or a dictionary for sparse " - "data. Got {!r} instead.".format(data[0]) + f"data. 
Got {data[0]!r} instead.", ) arff_object = { @@ -751,11 +858,10 @@ def create_dataset( decoder = arff.ArffDecoder() return_type = arff.COO if data_format == "sparse_arff" else arff.DENSE decoder.decode(arff_dataset, encode_nominal=True, return_type=return_type) - except arff.ArffException: + except arff.ArffException as e: raise ValueError( - "The arguments you have provided \ - do not construct a valid ARFF file" - ) + "The arguments you have provided do not construct a valid ARFF file" + ) from e return OpenMLDataset( name=name, @@ -778,7 +884,7 @@ def create_dataset( ) -def status_update(data_id, status): +def status_update(data_id: int, status: Literal["active", "deactivated"]) -> None: """ Updates the status of a dataset to either 'active' or 'deactivated'. Please see the OpenML API documentation for a description of the status @@ -794,8 +900,9 @@ def status_update(data_id, status): """ legal_status = {"active", "deactivated"} if status not in legal_status: - raise ValueError("Illegal status value. " "Legal values: %s" % legal_status) - data = {"data_id": data_id, "status": status} + raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") + + data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status} result_xml = openml._api_calls._perform_api_call("data/status/update", "post", data=data) result = xmltodict.parse(result_xml) server_data_id = result["oml:data_status_update"]["oml:id"] @@ -806,18 +913,18 @@ def status_update(data_id, status): def edit_dataset( - data_id, - description=None, - creator=None, - contributor=None, - collection_date=None, - language=None, - default_target_attribute=None, - ignore_attribute=None, - citation=None, - row_id_attribute=None, - original_data_url=None, - paper_url=None, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, ) -> int: """Edits an OpenMLDataset. 
@@ -877,7 +984,7 @@ def edit_dataset( Dataset id """ if not isinstance(data_id, int): - raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") # compose data edit parameters as xml form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE @@ -902,10 +1009,13 @@ def edit_dataset( del xml["oml:data_edit_parameters"][k] file_elements = { - "edit_parameters": ("description.xml", xmltodict.unparse(xml)) + "edit_parameters": ("description.xml", xmltodict.unparse(xml)), } # type: openml._api_calls.FILE_ELEMENTS_TYPE result_xml = openml._api_calls._perform_api_call( - "data/edit", "post", data=form_data, file_elements=file_elements + "data/edit", + "post", + data=form_data, + file_elements=file_elements, ) result = xmltodict.parse(result_xml) data_id = result["oml:data_edit"]["oml:id"] @@ -942,7 +1052,7 @@ def fork_dataset(data_id: int) -> int: """ if not isinstance(data_id, int): - raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") # compose data fork parameters form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data) @@ -951,19 +1061,77 @@ def fork_dataset(data_id: int) -> int: return int(data_id) -def _topic_add_dataset(data_id: int, topic: str): +def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: + """ + An ontology describes the concept that are described in a feature. An + ontology is defined by an URL where the information is provided. Adds + an ontology (URL) to a given dataset feature (defined by a dataset id + and index). The dataset has to exists on OpenML and needs to have been + processed by the evaluation engine. 
+ + Parameters + ---------- + data_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology} + openml._api_calls._perform_api_call("data/feature/ontology/add", "post", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + + +def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool: + """ + Removes an existing ontology (URL) from a given dataset feature (defined + by a dataset id and index). The dataset has to exists on OpenML and needs + to have been processed by the evaluation engine. Ontology needs to be + attached to the specific fearure. + + Parameters + ---------- + data_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology} + openml._api_calls._perform_api_call("data/feature/ontology/remove", "post", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + + +def _topic_add_dataset(data_id: int, topic: str) -> int: """ Adds a topic for a dataset. This API is not available for all OpenML users and is accessible only by admins. 
+ Parameters ---------- data_id : int id of the dataset for which the topic needs to be added topic : str Topic to be added for the dataset + + Returns + ------- + Dataset id """ if not isinstance(data_id, int): - raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data) result = xmltodict.parse(result_xml) @@ -971,10 +1139,11 @@ def _topic_add_dataset(data_id: int, topic: str): return int(data_id) -def _topic_delete_dataset(data_id: int, topic: str): +def _topic_delete_dataset(data_id: int, topic: str) -> int: """ Removes a topic from a dataset. This API is not available for all OpenML users and is accessible only by admins. + Parameters ---------- data_id : int @@ -982,9 +1151,12 @@ def _topic_delete_dataset(data_id: int, topic: str): topic : str Topic to be deleted + Returns + ------- + Dataset id """ if not isinstance(data_id, int): - raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data) result = xmltodict.parse(result_xml) @@ -992,14 +1164,14 @@ def _topic_delete_dataset(data_id: int, topic: str): return int(data_id) -def _get_dataset_description(did_cache_dir, dataset_id): +def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, Any]: """Get the dataset description as xml dictionary. This function is NOT thread/multiprocessing safe. Parameters ---------- - did_cache_dir : str + did_cache_dir : Path Cache subdirectory for this dataset. 
dataset_id : int @@ -1011,35 +1183,35 @@ def _get_dataset_description(did_cache_dir, dataset_id): XML Dataset description parsed to a dict. """ - # TODO implement a cache for this that invalidates itself after some time # This can be saved on disk, but cannot be cached properly, because # it contains the information on whether a dataset is active. - description_file = os.path.join(did_cache_dir, "description.xml") + description_file = did_cache_dir / "description.xml" try: - with io.open(description_file, encoding="utf8") as fh: + with description_file.open(encoding="utf8") as fh: dataset_xml = fh.read() description = xmltodict.parse(dataset_xml)["oml:data_set_description"] - except Exception: - url_extension = "data/{}".format(dataset_id) + except Exception: # noqa: BLE001 + url_extension = f"data/{dataset_id}" dataset_xml = openml._api_calls._perform_api_call(url_extension, "get") try: description = xmltodict.parse(dataset_xml)["oml:data_set_description"] except ExpatError as e: url = openml._api_calls._create_url_from_endpoint(url_extension) raise OpenMLServerError(f"Dataset description XML at '{url}' is malformed.") from e - with io.open(description_file, "w", encoding="utf8") as fh: + + with description_file.open("w", encoding="utf8") as fh: fh.write(dataset_xml) - return description + return description # type: ignore def _get_dataset_parquet( - description: Union[Dict, OpenMLDataset], - cache_directory: Optional[str] = None, - download_all_files: bool = False, -) -> Optional[str]: + description: dict | OpenMLDataset, + cache_directory: Path | None = None, + download_all_files: bool = False, # noqa: FBT001, FBT002 +) -> Path | None: """Return the path to the local parquet file of the dataset. If is not cached, it is downloaded. Checks if the file is in the cache, if yes, return the path to the file. @@ -1054,59 +1226,62 @@ def _get_dataset_parquet( description : dictionary or OpenMLDataset Either a dataset description as dict or OpenMLDataset. 
- cache_directory: str, optional (default=None) + cache_directory: Path, optional (default=None) Folder to store the parquet file in. If None, use the default cache directory for the dataset. download_all_files: bool, optional (default=False) If `True`, download all data found in the bucket to which the description's - ``minio_url`` points, only download the parquet file otherwise. + ``parquet_url`` points, only download the parquet file otherwise. Returns ------- - output_filename : string, optional + output_filename : Path, optional Location of the Parquet file if successfully downloaded, None otherwise. """ if isinstance(description, dict): - url = cast(str, description.get("oml:minio_url")) - did = description.get("oml:id") + url = str(description.get("oml:parquet_url")) + did = int(description.get("oml:id")) # type: ignore elif isinstance(description, OpenMLDataset): - url = cast(str, description._minio_url) - did = description.dataset_id + url = str(description._parquet_url) + assert description.dataset_id is not None + + did = int(description.dataset_id) else: raise TypeError("`description` should be either OpenMLDataset or Dict.") if cache_directory is None: cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) - output_file_path = os.path.join(cache_directory, f"dataset_{did}.pq") - old_file_path = os.path.join(cache_directory, "dataset.pq") - if os.path.isfile(old_file_path): - os.rename(old_file_path, output_file_path) + output_file_path = cache_directory / f"dataset_{did}.pq" + + old_file_path = cache_directory / "dataset.pq" + if old_file_path.is_file(): + old_file_path.rename(output_file_path) # For this release, we want to be able to force a new download even if the # parquet file is already present when ``download_all_files`` is set. # For now, it would be the only way for the user to fetch the additional # files in the bucket (no function exists on an OpenMLDataset to do this). 
if download_all_files: - if url.endswith(".pq"): - url, _ = url.rsplit("/", maxsplit=1) - openml._api_calls._download_minio_bucket(source=cast(str, url), destination=cache_directory) + openml._api_calls._download_minio_bucket(source=url, destination=cache_directory) - if not os.path.isfile(output_file_path): + if not output_file_path.is_file(): try: openml._api_calls._download_minio_file( - source=cast(str, url), destination=output_file_path + source=url, + destination=output_file_path, ) except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: - logger.warning("Could not download file from %s: %s" % (cast(str, url), e)) + logger.warning(f"Could not download file from {url}: {e}") return None return output_file_path def _get_dataset_arff( - description: Union[Dict, OpenMLDataset], cache_directory: Optional[str] = None -) -> str: + description: dict | OpenMLDataset, + cache_directory: Path | None = None, +) -> Path: """Return the path to the local arff file of the dataset. If is not cached, it is downloaded. Checks if the file is in the cache, if yes, return the path to the file. @@ -1120,48 +1295,56 @@ def _get_dataset_arff( description : dictionary or OpenMLDataset Either a dataset description as dict or OpenMLDataset. - cache_directory: str, optional (default=None) + cache_directory: Path, optional (default=None) Folder to store the arff file in. If None, use the default cache directory for the dataset. Returns ------- - output_filename : string + output_filename : Path Location of ARFF file. 
""" if isinstance(description, dict): md5_checksum_fixture = description.get("oml:md5_checksum") - url = description["oml:url"] - did = description.get("oml:id") + url = str(description["oml:url"]) + did = int(description.get("oml:id")) # type: ignore elif isinstance(description, OpenMLDataset): md5_checksum_fixture = description.md5_checksum + assert description.url is not None + assert description.dataset_id is not None + url = description.url - did = description.dataset_id + did = int(description.dataset_id) else: raise TypeError("`description` should be either OpenMLDataset or Dict.") - if cache_directory is None: - cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) - output_file_path = os.path.join(cache_directory, "dataset.arff") + save_cache_directory = ( + _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) + if cache_directory is None + else Path(cache_directory) + ) + output_file_path = save_cache_directory / "dataset.arff" try: openml._api_calls._download_text_file( - source=url, output_path=output_file_path, md5_checksum=md5_checksum_fixture + source=url, + output_path=output_file_path, + md5_checksum=md5_checksum_fixture, ) except OpenMLHashException as e: - additional_info = " Raised when downloading dataset {}.".format(did) + additional_info = f" Raised when downloading dataset {did}." e.args = (e.args[0] + additional_info,) - raise + raise e return output_file_path -def _get_features_xml(dataset_id): +def _get_features_xml(dataset_id: int) -> str: url_extension = f"data/features/{dataset_id}" return openml._api_calls._perform_api_call(url_extension, "get") -def _get_dataset_features_file(did_cache_dir: Union[str, None], dataset_id: int) -> str: +def _get_dataset_features_file(did_cache_dir: str | Path | None, dataset_id: int) -> Path: """API call to load dataset features. Loads from cache or downloads them. Features are feature descriptions for each column. 
@@ -1179,37 +1362,36 @@ def _get_dataset_features_file(did_cache_dir: Union[str, None], dataset_id: int) Returns ------- - str + Path Path of the cached dataset feature file """ - + did_cache_dir = Path(did_cache_dir) if did_cache_dir is not None else None if did_cache_dir is None: - did_cache_dir = _create_cache_directory_for_id( - DATASETS_CACHE_DIR_NAME, - dataset_id, - ) + did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - features_file = os.path.join(did_cache_dir, "features.xml") + features_file = did_cache_dir / "features.xml" # Dataset features aren't subject to change... - if not os.path.isfile(features_file): + if not features_file.is_file(): features_xml = _get_features_xml(dataset_id) - with io.open(features_file, "w", encoding="utf8") as fh: + with features_file.open("w", encoding="utf8") as fh: fh.write(features_xml) return features_file -def _get_qualities_xml(dataset_id): - url_extension = f"data/qualities/{dataset_id}" +def _get_qualities_xml(dataset_id: int) -> str: + url_extension = f"data/qualities/{dataset_id!s}" return openml._api_calls._perform_api_call(url_extension, "get") def _get_dataset_qualities_file( - did_cache_dir: Union[str, None], dataset_id: int -) -> Union[str, None]: - """API call to load dataset qualities. Loads from cache or downloads them. + did_cache_dir: str | Path | None, + dataset_id: int, +) -> Path | None: + """Get the path for the dataset qualities file, or None if no qualities exist. + Loads from cache or downloads them. Features are metafeatures (number of features, number of classes, ...) This function is NOT thread/multiprocessing safe. @@ -1222,47 +1404,45 @@ def _get_dataset_qualities_file( dataset_id : int Dataset ID - download_qualities : bool - wheather to download/use cahsed version or not. 
Returns ------- str Path of the cached qualities file """ - if did_cache_dir is None: - did_cache_dir = _create_cache_directory_for_id( - DATASETS_CACHE_DIR_NAME, - dataset_id, - ) + save_did_cache_dir = ( + _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) + if did_cache_dir is None + else Path(did_cache_dir) + ) # Dataset qualities are subject to change and must be fetched every time - qualities_file = os.path.join(did_cache_dir, "qualities.xml") + qualities_file = save_did_cache_dir / "qualities.xml" try: - with io.open(qualities_file, encoding="utf8") as fh: + with qualities_file.open(encoding="utf8") as fh: qualities_xml = fh.read() - except (OSError, IOError): + except OSError: try: qualities_xml = _get_qualities_xml(dataset_id) - with io.open(qualities_file, "w", encoding="utf8") as fh: + with qualities_file.open("w", encoding="utf8") as fh: fh.write(qualities_xml) except OpenMLServerException as e: if e.code == 362 and str(e) == "No qualities found - None": # quality file stays as None - logger.warning("No qualities found for dataset {}".format(dataset_id)) + logger.warning(f"No qualities found for dataset {dataset_id}") return None - else: - raise + + raise e return qualities_file def _create_dataset_from_description( - description: Dict[str, str], - features_file: Optional[str] = None, - qualities_file: Optional[str] = None, - arff_file: Optional[str] = None, - parquet_file: Optional[str] = None, - cache_format: str = "pickle", + description: dict[str, str], + features_file: Path | None = None, + qualities_file: Path | None = None, + arff_file: Path | None = None, + parquet_file: Path | None = None, + cache_format: Literal["pickle", "feather"] = "pickle", ) -> OpenMLDataset: """Create a dataset object from a description dict. @@ -1270,9 +1450,9 @@ def _create_dataset_from_description( ---------- description : dict Description of a dataset in xml dict. - featuresfile : str + features_file : str Path of the dataset features as xml file. 
- qualities : list + qualities_file : list Path of the dataset qualities as xml file. arff_file : string, optional Path of dataset ARFF file. @@ -1289,9 +1469,9 @@ def _create_dataset_from_description( return OpenMLDataset( description["oml:name"], description.get("oml:description"), - data_format=description["oml:format"], - dataset_id=description["oml:id"], - version=description["oml:version"], + data_format=description["oml:format"], # type: ignore + dataset_id=int(description["oml:id"]), + version=int(description["oml:version"]), creator=description.get("oml:creator"), contributor=description.get("oml:contributor"), collection_date=description.get("oml:collection_date"), @@ -1310,16 +1490,16 @@ def _create_dataset_from_description( paper_url=description.get("oml:paper_url"), update_comment=description.get("oml:update_comment"), md5_checksum=description.get("oml:md5_checksum"), - data_file=arff_file, + data_file=str(arff_file) if arff_file is not None else None, cache_format=cache_format, - features_file=features_file, - qualities_file=qualities_file, - minio_url=description.get("oml:minio_url"), - parquet_file=parquet_file, + features_file=str(features_file) if features_file is not None else None, + qualities_file=str(qualities_file) if qualities_file is not None else None, + parquet_url=description.get("oml:parquet_url"), + parquet_file=str(parquet_file) if parquet_file is not None else None, ) -def _get_online_dataset_arff(dataset_id): +def _get_online_dataset_arff(dataset_id: int) -> str | None: """Download the ARFF file for a given dataset id from the OpenML website. @@ -1330,8 +1510,8 @@ def _get_online_dataset_arff(dataset_id): Returns ------- - str - A string representation of an ARFF file. + str or None + A string representation of an ARFF file. Or None if file already exists. """ dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get") # build a dict from the xml. 
@@ -1341,7 +1521,7 @@ def _get_online_dataset_arff(dataset_id): ) -def _get_online_dataset_format(dataset_id): +def _get_online_dataset_format(dataset_id: int) -> str: """Get the dataset format for a given dataset id from the OpenML website. @@ -1357,7 +1537,7 @@ def _get_online_dataset_format(dataset_id): """ dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get") # build a dict from the xml and get the format from the dataset description - return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower() + return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower() # type: ignore def delete_dataset(dataset_id: int) -> bool: diff --git a/openml/evaluations/__init__.py b/openml/evaluations/__init__.py index 400a59652..dbff47037 100644 --- a/openml/evaluations/__init__.py +++ b/openml/evaluations/__init__.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause from .evaluation import OpenMLEvaluation -from .functions import list_evaluations, list_evaluation_measures, list_evaluations_setups +from .functions import list_evaluation_measures, list_evaluations, list_evaluations_setups __all__ = [ "OpenMLEvaluation", diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py index 8bdf741c2..3cf732f25 100644 --- a/openml/evaluations/evaluation.py +++ b/openml/evaluations/evaluation.py @@ -1,9 +1,14 @@ # License: BSD 3-Clause +from __future__ import annotations import openml.config +import openml.datasets +import openml.flows +import openml.runs +import openml.tasks -class OpenMLEvaluation(object): +class OpenMLEvaluation: """ Contains all meta-information about a run / evaluation combination, according to the evaluation/list function @@ -41,22 +46,22 @@ class OpenMLEvaluation(object): (e.g., in case of precision, auroc, recall) """ - def __init__( + def __init__( # noqa: PLR0913 self, - run_id, - task_id, - setup_id, - flow_id, - flow_name, - data_id, - data_name, - function, - upload_time, + 
run_id: int, + task_id: int, + setup_id: int, + flow_id: int, + flow_name: str, + data_id: int, + data_name: str, + function: str, + upload_time: str, uploader: int, uploader_name: str, - value, - values, - array_data=None, + value: float | None, + values: list[float] | None, + array_data: str | None = None, ): self.run_id = run_id self.task_id = task_id @@ -73,7 +78,7 @@ def __init__( self.values = values self.array_data = array_data - def __repr__(self): + def __repr__(self) -> str: header = "OpenML Evaluation" header = "{}\n{}\n".format(header, "=" * len(header)) @@ -107,9 +112,9 @@ def __repr__(self): "Metric Used", "Result", ] - fields = [(key, fields[key]) for key in order if key in fields] + _fields = [(key, fields[key]) for key in order if key in fields] - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = "\n".join(field_line_format.format(name, value) for name, value in fields) + longest_field_name_length = max(len(name) for name, _ in _fields) + field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" + body = "\n".join(field_line_format.format(name, value) for name, value in _fields) return header + body diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 214348345..a854686d1 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -1,35 +1,75 @@ # License: BSD 3-Clause +# ruff: noqa: PLR0913 +from __future__ import annotations import json import warnings +from typing import Any +from typing_extensions import Literal, overload -import xmltodict -import pandas as pd import numpy as np -from typing import Union, List, Optional, Dict -import collections +import pandas as pd +import xmltodict -import openml.utils -import openml._api_calls -from ..evaluations import OpenMLEvaluation import openml +import openml._api_calls +import openml.utils +from openml.evaluations import OpenMLEvaluation 
+ + +@overload +def list_evaluations( + function: str, + offset: int | None = ..., + size: int | None = ..., + tasks: list[str | int] | None = ..., + setups: list[str | int] | None = ..., + flows: list[str | int] | None = ..., + runs: list[str | int] | None = ..., + uploaders: list[str | int] | None = ..., + tag: str | None = ..., + study: int | None = ..., + per_fold: bool | None = ..., + sort_order: str | None = ..., + output_format: Literal["dict", "object"] = "dict", +) -> dict: + ... + + +@overload +def list_evaluations( + function: str, + offset: int | None = ..., + size: int | None = ..., + tasks: list[str | int] | None = ..., + setups: list[str | int] | None = ..., + flows: list[str | int] | None = ..., + runs: list[str | int] | None = ..., + uploaders: list[str | int] | None = ..., + tag: str | None = ..., + study: int | None = ..., + per_fold: bool | None = ..., + sort_order: str | None = ..., + output_format: Literal["dataframe"] = ..., +) -> pd.DataFrame: + ... def list_evaluations( function: str, - offset: Optional[int] = None, - size: Optional[int] = 10000, - tasks: Optional[List[Union[str, int]]] = None, - setups: Optional[List[Union[str, int]]] = None, - flows: Optional[List[Union[str, int]]] = None, - runs: Optional[List[Union[str, int]]] = None, - uploaders: Optional[List[Union[str, int]]] = None, - tag: Optional[str] = None, - study: Optional[int] = None, - per_fold: Optional[bool] = None, - sort_order: Optional[str] = None, - output_format: str = "object", -) -> Union[Dict, pd.DataFrame]: + offset: int | None = None, + size: int | None = 10000, + tasks: list[str | int] | None = None, + setups: list[str | int] | None = None, + flows: list[str | int] | None = None, + runs: list[str | int] | None = None, + uploaders: list[str | int] | None = None, + tag: str | None = None, + study: int | None = None, + per_fold: bool | None = None, + sort_order: str | None = None, + output_format: Literal["object", "dict", "dataframe"] = "object", +) -> dict | 
pd.DataFrame: """ List all run-evaluation pairs matching all of the given filters. (Supports large amount of results) @@ -76,7 +116,7 @@ def list_evaluations( """ if output_format not in ["dataframe", "dict", "object"]: raise ValueError( - "Invalid output format selected. " "Only 'object', 'dataframe', or 'dict' applicable." + "Invalid output format selected. Only 'object', 'dataframe', or 'dict' applicable.", ) # TODO: [0.15] @@ -92,8 +132,8 @@ def list_evaluations( if per_fold is not None: per_fold_str = str(per_fold).lower() - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_evaluations, function=function, offset=offset, @@ -112,16 +152,16 @@ def list_evaluations( def _list_evaluations( function: str, - tasks: Optional[List] = None, - setups: Optional[List] = None, - flows: Optional[List] = None, - runs: Optional[List] = None, - uploaders: Optional[List] = None, - study: Optional[int] = None, - sort_order: Optional[str] = None, - output_format: str = "object", - **kwargs -) -> Union[Dict, pd.DataFrame]: + tasks: list | None = None, + setups: list | None = None, + flows: list | None = None, + runs: list | None = None, + uploaders: list | None = None, + study: int | None = None, + sort_order: str | None = None, + output_format: Literal["object", "dict", "dataframe"] = "object", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Perform API call ``/evaluation/function{function}/{filters}`` @@ -164,11 +204,10 @@ def _list_evaluations( ------- dict of objects, or dataframe """ - api_call = "evaluation/list/function/%s" % function if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" if tasks is not None: api_call += "/task/%s" % ",".join([str(int(i)) for i in tasks]) if setups is not None: @@ -187,23 +226,26 @@ def _list_evaluations( return 
__list_evaluations(api_call, output_format=output_format) -def __list_evaluations(api_call, output_format="object"): +def __list_evaluations( + api_call: str, + output_format: Literal["object", "dict", "dataframe"] = "object", +) -> dict | pd.DataFrame: """Helper function to parse API calls which are lists of runs""" xml_string = openml._api_calls._perform_api_call(api_call, "get") evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",)) # Minimalistic check if the XML is useful if "oml:evaluations" not in evals_dict: raise ValueError( - "Error in return XML, does not contain " '"oml:evaluations": %s' % str(evals_dict) + "Error in return XML, does not contain " '"oml:evaluations": %s' % str(evals_dict), ) - assert type(evals_dict["oml:evaluations"]["oml:evaluation"]) == list, type( - evals_dict["oml:evaluations"] + assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), type( + evals_dict["oml:evaluations"], ) - evals = collections.OrderedDict() + evals: dict[int, dict | OpenMLEvaluation] = {} uploader_ids = list( - set([eval_["oml:uploader"] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]]) + {eval_["oml:uploader"] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]}, ) api_users = "user/list/user_id/" + ",".join(uploader_ids) xml_string_user = openml._api_calls._perform_api_call(api_users, "get") @@ -211,32 +253,33 @@ def __list_evaluations(api_call, output_format="object"): user_dict = {user["oml:id"]: user["oml:username"] for user in users["oml:users"]["oml:user"]} for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]: run_id = int(eval_["oml:run_id"]) + value = None - values = None - array_data = None if "oml:value" in eval_: value = float(eval_["oml:value"]) + + values = None if "oml:values" in eval_: values = json.loads(eval_["oml:values"]) - if "oml:array_data" in eval_: - array_data = eval_["oml:array_data"] + + array_data = eval_.get("oml:array_data") if output_format == "object": 
evals[run_id] = OpenMLEvaluation( - int(eval_["oml:run_id"]), - int(eval_["oml:task_id"]), - int(eval_["oml:setup_id"]), - int(eval_["oml:flow_id"]), - eval_["oml:flow_name"], - int(eval_["oml:data_id"]), - eval_["oml:data_name"], - eval_["oml:function"], - eval_["oml:upload_time"], - int(eval_["oml:uploader"]), - user_dict[eval_["oml:uploader"]], - value, - values, - array_data, + run_id=run_id, + task_id=int(eval_["oml:task_id"]), + setup_id=int(eval_["oml:setup_id"]), + flow_id=int(eval_["oml:flow_id"]), + flow_name=eval_["oml:flow_name"], + data_id=int(eval_["oml:data_id"]), + data_name=eval_["oml:data_name"], + function=eval_["oml:function"], + upload_time=eval_["oml:upload_time"], + uploader=int(eval_["oml:uploader"]), + uploader_name=user_dict[eval_["oml:uploader"]], + value=value, + values=values, + array_data=array_data, ) else: # for output_format in ['dict', 'dataframe'] @@ -258,12 +301,13 @@ def __list_evaluations(api_call, output_format="object"): } if output_format == "dataframe": - rows = [value for key, value in evals.items()] - evals = pd.DataFrame.from_records(rows, columns=rows[0].keys()) + rows = list(evals.values()) + return pd.DataFrame.from_records(rows, columns=rows[0].keys()) # type: ignore + return evals -def list_evaluation_measures() -> List[str]: +def list_evaluation_measures() -> list[str]: """Return list of evaluation measures available. 
The function performs an API call to retrieve the entire list of @@ -282,11 +326,10 @@ def list_evaluation_measures() -> List[str]: raise ValueError("Error in return XML, does not contain " '"oml:evaluation_measures"') if not isinstance(qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"], list): raise TypeError("Error in return XML, does not contain " '"oml:measure" as a list') - qualities = qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"] - return qualities + return qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"] -def list_estimation_procedures() -> List[str]: +def list_estimation_procedures() -> list[str]: """Return list of evaluation procedures available. The function performs an API call to retrieve the entire list of @@ -296,7 +339,6 @@ def list_estimation_procedures() -> List[str]: ------- list """ - api_call = "estimationprocedure/list" xml_string = openml._api_calls._perform_api_call(api_call, "get") api_results = xmltodict.parse(xml_string) @@ -309,31 +351,30 @@ def list_estimation_procedures() -> List[str]: if not isinstance(api_results["oml:estimationprocedures"]["oml:estimationprocedure"], list): raise TypeError( - "Error in return XML, does not contain " '"oml:estimationprocedure" as a list' + "Error in return XML, does not contain " '"oml:estimationprocedure" as a list', ) - prods = [ + return [ prod["oml:name"] for prod in api_results["oml:estimationprocedures"]["oml:estimationprocedure"] ] - return prods def list_evaluations_setups( function: str, - offset: Optional[int] = None, - size: Optional[int] = None, - tasks: Optional[List] = None, - setups: Optional[List] = None, - flows: Optional[List] = None, - runs: Optional[List] = None, - uploaders: Optional[List] = None, - tag: Optional[str] = None, - per_fold: Optional[bool] = None, - sort_order: Optional[str] = None, + offset: int | None = None, + size: int | None = None, + tasks: list | None = None, + setups: list | None = None, + 
flows: list | None = None, + runs: list | None = None, + uploaders: list | None = None, + tag: str | None = None, + per_fold: bool | None = None, + sort_order: str | None = None, output_format: str = "dataframe", - parameters_in_separate_columns: bool = False, -) -> Union[Dict, pd.DataFrame]: + parameters_in_separate_columns: bool = False, # noqa: FBT001, FBT002 +) -> dict | pd.DataFrame: """ List all run-evaluation pairs matching all of the given filters and their hyperparameter settings. @@ -376,7 +417,7 @@ def list_evaluations_setups( """ if parameters_in_separate_columns and (flows is None or len(flows) != 1): raise ValueError( - "Can set parameters_in_separate_columns to true " "only for single flow_id" + "Can set parameters_in_separate_columns to true " "only for single flow_id", ) # List evaluations @@ -397,40 +438,42 @@ def list_evaluations_setups( # List setups # list_setups by setup id does not support large sizes (exceeds URL length limit) # Hence we split the list of unique setup ids returned by list_evaluations into chunks of size N - df = pd.DataFrame() + _df = pd.DataFrame() if len(evals) != 0: N = 100 # size of section length = len(evals["setup_id"].unique()) # length of the array we want to split # array_split - allows indices_or_sections to not equally divide the array # array_split -length % N sub-arrays of size length//N + 1 and the rest of size length//N. 
- setup_chunks = np.array_split( - ary=evals["setup_id"].unique(), indices_or_sections=((length - 1) // N) + 1 - ) + uniq = np.asarray(evals["setup_id"].unique()) + setup_chunks = np.array_split(uniq, ((length - 1) // N) + 1) setup_data = pd.DataFrame() - for setups in setup_chunks: - result = pd.DataFrame( - openml.setups.list_setups(setup=setups, output_format="dataframe") - ) - result.drop("flow_id", axis=1, inplace=True) + for _setups in setup_chunks: + result = openml.setups.list_setups(setup=_setups, output_format="dataframe") + assert isinstance(result, pd.DataFrame) + result = result.drop("flow_id", axis=1) # concat resulting setup chunks into single datframe setup_data = pd.concat([setup_data, result], ignore_index=True) + parameters = [] # Convert parameters of setup into list of tuples of (hyperparameter, value) for parameter_dict in setup_data["parameters"]: if parameter_dict is not None: parameters.append( - {param["full_name"]: param["value"] for param in parameter_dict.values()} + {param["full_name"]: param["value"] for param in parameter_dict.values()}, ) else: parameters.append({}) setup_data["parameters"] = parameters # Merge setups with evaluations - df = pd.merge(evals, setup_data, on="setup_id", how="left") + _df = evals.merge(setup_data, on="setup_id", how="left") if parameters_in_separate_columns: - df = pd.concat([df.drop("parameters", axis=1), df["parameters"].apply(pd.Series)], axis=1) + _df = pd.concat( + [_df.drop("parameters", axis=1), _df["parameters"].apply(pd.Series)], + axis=1, + ) if output_format == "dataframe": - return df - else: - return df.to_dict(orient="index") + return _df + + return _df.to_dict(orient="index") diff --git a/openml/exceptions.py b/openml/exceptions.py index a86434f51..fe63b8a58 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -1,9 +1,10 @@ # License: BSD 3-Clause - -from typing import Optional +from __future__ import annotations class PyOpenMLError(Exception): + """Base class for all 
exceptions in OpenML-Python.""" + def __init__(self, message: str): self.message = message super().__init__(message) @@ -11,55 +12,47 @@ def __init__(self, message: str): class OpenMLServerError(PyOpenMLError): """class for when something is really wrong on the server - (result did not parse to dict), contains unparsed error.""" + (result did not parse to dict), contains unparsed error. + """ - pass - -class OpenMLServerException(OpenMLServerError): +class OpenMLServerException(OpenMLServerError): # noqa: N818 """exception for when the result of the server was - not 200 (e.g., listing call w/o results).""" + not 200 (e.g., listing call w/o results). + """ # Code needs to be optional to allow the exception to be picklable: # https://stackoverflow.com/questions/16244923/how-to-make-a-custom-exception-class-with-multiple-init-args-pickleable # noqa: E501 - def __init__(self, message: str, code: Optional[int] = None, url: Optional[str] = None): + def __init__(self, message: str, code: int | None = None, url: str | None = None): self.message = message self.code = code self.url = url super().__init__(message) - def __str__(self): + def __str__(self) -> str: return f"{self.url} returned code {self.code}: {self.message}" class OpenMLServerNoResult(OpenMLServerException): """Exception for when the result of the server is empty.""" - pass - -class OpenMLCacheException(PyOpenMLError): +class OpenMLCacheException(PyOpenMLError): # noqa: N818 """Dataset / task etc not found in cache""" - pass - -class OpenMLHashException(PyOpenMLError): +class OpenMLHashException(PyOpenMLError): # noqa: N818 """Locally computed hash is different than hash announced by the server.""" - pass - class OpenMLPrivateDatasetError(PyOpenMLError): """Exception thrown when the user has no rights to access the dataset.""" - pass - class OpenMLRunsExistError(PyOpenMLError): """Indicates run(s) already exists on the server when they should not be duplicated.""" - def __init__(self, run_ids: set, message: 
str): + def __init__(self, run_ids: set[int], message: str) -> None: if len(run_ids) < 1: raise ValueError("Set of run ids must be non-empty.") self.run_ids = run_ids @@ -69,4 +62,6 @@ def __init__(self, run_ids: set, message: str): class OpenMLNotAuthorizedError(OpenMLServerError): """Indicates an authenticated user is not authorized to execute the requested action.""" - pass + +class ObjectNotPublishedError(PyOpenMLError): + """Indicates an object has not been published yet.""" diff --git a/openml/extensions/__init__.py b/openml/extensions/__init__.py index 91cbc1600..b49865e0e 100644 --- a/openml/extensions/__init__.py +++ b/openml/extensions/__init__.py @@ -3,8 +3,7 @@ from typing import List, Type # noqa: F401 from .extension_interface import Extension -from .functions import register_extension, get_extension_by_model, get_extension_by_flow - +from .functions import get_extension_by_flow, get_extension_by_model, register_extension extensions = [] # type: List[Type[Extension]] diff --git a/openml/extensions/extension_interface.py b/openml/extensions/extension_interface.py index 981bf2417..2a336eb52 100644 --- a/openml/extensions/extension_interface.py +++ b/openml/extensions/extension_interface.py @@ -1,21 +1,21 @@ # License: BSD 3-Clause +from __future__ import annotations from abc import ABC, abstractmethod -from collections import OrderedDict # noqa: F401 -from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING, Union - -import numpy as np -import scipy.sparse +from collections import OrderedDict +from typing import TYPE_CHECKING, Any # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: + import numpy as np + import scipy.sparse + from openml.flows import OpenMLFlow + from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration # F401 from openml.tasks.task import OpenMLTask - from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration # noqa F401 class Extension(ABC): - 
"""Defines the interface to connect machine learning libraries to OpenML-Python. See ``openml.extension.sklearn.extension`` for an implementation to bootstrap from. @@ -26,7 +26,7 @@ class Extension(ABC): @classmethod @abstractmethod - def can_handle_flow(cls, flow: "OpenMLFlow") -> bool: + def can_handle_flow(cls, flow: OpenMLFlow) -> bool: """Check whether a given flow can be handled by this extension. This is typically done by parsing the ``external_version`` field. @@ -62,9 +62,9 @@ def can_handle_model(cls, model: Any) -> bool: @abstractmethod def flow_to_model( self, - flow: "OpenMLFlow", - initialize_with_defaults: bool = False, - strict_version: bool = True, + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + strict_version: bool = True, # noqa: FBT002, FBT001 ) -> Any: """Instantiate a model from the flow representation. @@ -85,7 +85,7 @@ def flow_to_model( """ @abstractmethod - def model_to_flow(self, model: Any) -> "OpenMLFlow": + def model_to_flow(self, model: Any) -> OpenMLFlow: """Transform a model to a flow for uploading it to OpenML. Parameters @@ -98,7 +98,7 @@ def model_to_flow(self, model: Any) -> "OpenMLFlow": """ @abstractmethod - def get_version_information(self) -> List[str]: + def get_version_information(self) -> list[str]: """List versions of libraries required by the flow. Returns @@ -139,7 +139,7 @@ def is_estimator(self, model: Any) -> bool: """ @abstractmethod - def seed_model(self, model: Any, seed: Optional[int]) -> Any: + def seed_model(self, model: Any, seed: int | None) -> Any: """Set the seed of all the unseeded components of a model and return the seeded model. Required so that all seed information can be uploaded to OpenML for reproducible results. 
@@ -156,16 +156,16 @@ def seed_model(self, model: Any, seed: Optional[int]) -> Any: """ @abstractmethod - def _run_model_on_fold( + def _run_model_on_fold( # noqa: PLR0913 self, model: Any, - task: "OpenMLTask", - X_train: Union[np.ndarray, scipy.sparse.spmatrix], + task: OpenMLTask, + X_train: np.ndarray | scipy.sparse.spmatrix, rep_no: int, fold_no: int, - y_train: Optional[np.ndarray] = None, - X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix]] = None, - ) -> Tuple[np.ndarray, np.ndarray, "OrderedDict[str, float]", Optional["OpenMLRunTrace"]]: + y_train: np.ndarray | None = None, + X_test: np.ndarray | scipy.sparse.spmatrix | None = None, + ) -> tuple[np.ndarray, np.ndarray | None, OrderedDict[str, float], OpenMLRunTrace | None]: """Run a model on a repeat, fold, subsample triplet of the task. Returns the data that is necessary to construct the OpenML Run object. Is used by @@ -205,9 +205,9 @@ def _run_model_on_fold( @abstractmethod def obtain_parameter_values( self, - flow: "OpenMLFlow", + flow: OpenMLFlow, model: Any = None, - ) -> List[Dict[str, Any]]: + ) -> list[dict[str, Any]]: """Extracts all parameter settings required for the flow from the model. If no explicit model is provided, the parameters will be extracted from `flow.model` @@ -251,7 +251,7 @@ def check_if_model_fitted(self, model: Any) -> bool: def instantiate_model_from_hpo_class( self, model: Any, - trace_iteration: "OpenMLTraceIteration", + trace_iteration: OpenMLTraceIteration, ) -> Any: """Instantiate a base model which can be searched over by the hyperparameter optimization model. diff --git a/openml/extensions/functions.py b/openml/extensions/functions.py index a080e1004..302ab246c 100644 --- a/openml/extensions/functions.py +++ b/openml/extensions/functions.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause +from __future__ import annotations -from typing import Any, Optional, Type, TYPE_CHECKING -from . 
import Extension +from typing import TYPE_CHECKING, Any # Need to implement the following by its full path because otherwise it won't be possible to # access openml.extensions.extensions @@ -11,8 +11,10 @@ if TYPE_CHECKING: from openml.flows import OpenMLFlow + from . import Extension -def register_extension(extension: Type[Extension]) -> None: + +def register_extension(extension: type[Extension]) -> None: """Register an extension. Registered extensions are considered by ``get_extension_by_flow`` and @@ -30,9 +32,9 @@ def register_extension(extension: Type[Extension]) -> None: def get_extension_by_flow( - flow: "OpenMLFlow", - raise_if_no_extension: bool = False, -) -> Optional[Extension]: + flow: OpenMLFlow, + raise_if_no_extension: bool = False, # noqa: FBT001, FBT002 +) -> Extension | None: """Get an extension which can handle the given flow. Iterates all registered extensions and checks whether they can handle the presented flow. @@ -55,22 +57,23 @@ def get_extension_by_flow( candidates.append(extension_class()) if len(candidates) == 0: if raise_if_no_extension: - raise ValueError("No extension registered which can handle flow: {}".format(flow)) - else: - return None - elif len(candidates) == 1: + raise ValueError(f"No extension registered which can handle flow: {flow}") + + return None + + if len(candidates) == 1: return candidates[0] - else: - raise ValueError( - "Multiple extensions registered which can handle flow: {}, but only one " - "is allowed ({}).".format(flow, candidates) - ) + + raise ValueError( + f"Multiple extensions registered which can handle flow: {flow}, but only one " + f"is allowed ({candidates}).", + ) def get_extension_by_model( model: Any, - raise_if_no_extension: bool = False, -) -> Optional[Extension]: + raise_if_no_extension: bool = False, # noqa: FBT001, FBT002 +) -> Extension | None: """Get an extension which can handle the given flow. Iterates all registered extensions and checks whether they can handle the presented model. 
@@ -93,13 +96,14 @@ def get_extension_by_model( candidates.append(extension_class()) if len(candidates) == 0: if raise_if_no_extension: - raise ValueError("No extension registered which can handle model: {}".format(model)) - else: - return None - elif len(candidates) == 1: + raise ValueError(f"No extension registered which can handle model: {model}") + + return None + + if len(candidates) == 1: return candidates[0] - else: - raise ValueError( - "Multiple extensions registered which can handle model: {}, but only one " - "is allowed ({}).".format(model, candidates) - ) + + raise ValueError( + f"Multiple extensions registered which can handle model: {model}, but only one " + f"is allowed ({candidates}).", + ) diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py index 135e5ccf6..9c1c6cba6 100644 --- a/openml/extensions/sklearn/__init__.py +++ b/openml/extensions/sklearn/__init__.py @@ -1,15 +1,21 @@ # License: BSD 3-Clause +from __future__ import annotations + +from typing import TYPE_CHECKING -from .extension import SklearnExtension from openml.extensions import register_extension +from .extension import SklearnExtension + +if TYPE_CHECKING: + import pandas as pd __all__ = ["SklearnExtension"] register_extension(SklearnExtension) -def cont(X): +def cont(X: pd.DataFrame) -> pd.Series: """Returns True for all non-categorical columns, False for the rest. This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling @@ -23,7 +29,7 @@ def cont(X): return X.dtypes != "category" -def cat(X): +def cat(X: pd.DataFrame) -> pd.Series: """Returns True for all categorical columns, False for the rest. 
This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 82d202e9c..3427ca7c9 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -1,23 +1,27 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import OrderedDict # noqa: F401 +import contextlib import copy -from distutils.version import LooseVersion import importlib import inspect import json import logging import re -from re import IGNORECASE import sys import time -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, cast, Sized +import traceback import warnings +from collections import OrderedDict +from distutils.version import LooseVersion +from json.decoder import JSONDecodeError +from re import IGNORECASE +from typing import Any, Callable, List, Sized, cast import numpy as np import pandas as pd -import scipy.stats import scipy.sparse +import scipy.stats import sklearn.base import sklearn.model_selection import sklearn.pipeline @@ -26,26 +30,22 @@ from openml.exceptions import PyOpenMLError from openml.extensions import Extension from openml.flows import OpenMLFlow -from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration, PREFIX +from openml.runs.trace import PREFIX, OpenMLRunTrace, OpenMLTraceIteration from openml.tasks import ( - OpenMLTask, - OpenMLSupervisedTask, OpenMLClassificationTask, - OpenMLLearningCurveTask, OpenMLClusteringTask, + OpenMLLearningCurveTask, OpenMLRegressionTask, + OpenMLSupervisedTask, + OpenMLTask, ) logger = logging.getLogger(__name__) -if sys.version_info >= (3, 5): - from json.decoder import JSONDecodeError -else: - JSONDecodeError = ValueError DEPENDENCIES_PATTERN = re.compile( r"^(?P[\w\-]+)((?P==|>=|>)" - r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$" + r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$", ) SIMPLE_NUMPY_TYPES = [ @@ -54,7 +54,7 
@@ for nptype in nptypes # type: ignore if type_cat != "others" ] -SIMPLE_TYPES = tuple([bool, int, float, str] + SIMPLE_NUMPY_TYPES) +SIMPLE_TYPES = (bool, int, float, str, *SIMPLE_NUMPY_TYPES) SKLEARN_PIPELINE_STRING_COMPONENTS = ("drop", "passthrough") COMPONENT_REFERENCE = "component_reference" @@ -71,7 +71,7 @@ class SklearnExtension(Extension): # General setup @classmethod - def can_handle_flow(cls, flow: "OpenMLFlow") -> bool: + def can_handle_flow(cls, flow: OpenMLFlow) -> bool: """Check whether a given describes a scikit-learn estimator. This is done by parsing the ``external_version`` field. @@ -101,8 +101,11 @@ def can_handle_model(cls, model: Any) -> bool: return isinstance(model, sklearn.base.BaseEstimator) @classmethod - def trim_flow_name( - cls, long_name: str, extra_trim_length: int = 100, _outer: bool = True + def trim_flow_name( # noqa: C901 + cls, + long_name: str, + extra_trim_length: int = 100, + _outer: bool = True, # noqa: FBT001, FBT002 ) -> str: """Shorten generated sklearn flow name to at most ``max_length`` characters. 
@@ -157,7 +160,7 @@ def remove_all_in_parentheses(string: str) -> str: # the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and # keep it in the final trimmed flow name: # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer, - # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: ERA001, E501 # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator= # sklearn.tree.tree.DecisionTreeClassifier)) if "sklearn.model_selection" in long_name: @@ -173,7 +176,7 @@ def remove_all_in_parentheses(string: str) -> str: # Now we want to also find and parse the `estimator`, for this we find the closing # parenthesis to the model selection technique: closing_parenthesis_expected = 1 - for i, char in enumerate(long_name[estimator_start:], start=estimator_start): + for char in long_name[estimator_start:]: if char == "(": closing_parenthesis_expected += 1 if char == ")": @@ -181,11 +184,13 @@ def remove_all_in_parentheses(string: str) -> str: if closing_parenthesis_expected == 0: break - model_select_pipeline = long_name[estimator_start:i] + _end: int = estimator_start + len(long_name[estimator_start:]) - 1 + model_select_pipeline = long_name[estimator_start:_end] + trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False) _, trimmed_pipeline = trimmed_pipeline.split(".", maxsplit=1) # trim module prefix - model_select_short = "sklearn.{}[{}]".format(model_selection_class, trimmed_pipeline) - name = long_name[:start_index] + model_select_short + long_name[i + 1 :] + model_select_short = f"sklearn.{model_selection_class}[{trimmed_pipeline}]" + name = long_name[:start_index] + model_select_short + long_name[_end + 1 :] else: name = long_name @@ -204,7 +209,7 @@ def remove_all_in_parentheses(string: str) -> str: components = [component.split(".")[-1] for component in 
pipeline.split(",")] pipeline = "{}({})".format(pipeline_class, ",".join(components)) if len(short_name.format(pipeline)) > extra_trim_length: - pipeline = "{}(...,{})".format(pipeline_class, components[-1]) + pipeline = f"{pipeline_class}(...,{components[-1]})" else: # Just a simple component: e.g. sklearn.tree.DecisionTreeClassifier pipeline = remove_all_in_parentheses(name).split(".")[-1] @@ -242,10 +247,10 @@ def _min_dependency_str(cls, sklearn_version: str) -> str: from sklearn import _min_dependencies as _mindep dependency_list = { - "numpy": "{}".format(_mindep.NUMPY_MIN_VERSION), - "scipy": "{}".format(_mindep.SCIPY_MIN_VERSION), - "joblib": "{}".format(_mindep.JOBLIB_MIN_VERSION), - "threadpoolctl": "{}".format(_mindep.THREADPOOLCTL_MIN_VERSION), + "numpy": f"{_mindep.NUMPY_MIN_VERSION}", + "scipy": f"{_mindep.SCIPY_MIN_VERSION}", + "joblib": f"{_mindep.JOBLIB_MIN_VERSION}", + "threadpoolctl": f"{_mindep.THREADPOOLCTL_MIN_VERSION}", } elif LooseVersion(sklearn_version) >= "0.23": dependency_list = { @@ -269,8 +274,8 @@ def _min_dependency_str(cls, sklearn_version: str) -> str: # the dependency list will be accurately updated for any flow uploaded to OpenML dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} - sklearn_dep = "sklearn=={}".format(sklearn_version) - dep_str = "\n".join(["{}>={}".format(k, v) for k, v in dependency_list.items()]) + sklearn_dep = f"sklearn=={sklearn_version}" + dep_str = "\n".join([f"{k}>={v}" for k, v in dependency_list.items()]) return "\n".join([sklearn_dep, dep_str]) ################################################################################################ @@ -278,9 +283,9 @@ def _min_dependency_str(cls, sklearn_version: str) -> str: def flow_to_model( self, - flow: "OpenMLFlow", - initialize_with_defaults: bool = False, - strict_version: bool = True, + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + strict_version: bool = True, # noqa: FBT001, FBT002 ) -> Any: """Initializes a 
sklearn model based on a flow. @@ -302,16 +307,18 @@ def flow_to_model( mixed """ return self._deserialize_sklearn( - flow, initialize_with_defaults=initialize_with_defaults, strict_version=strict_version + flow, + initialize_with_defaults=initialize_with_defaults, + strict_version=strict_version, ) - def _deserialize_sklearn( + def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0913, PLR0912 self, o: Any, - components: Optional[Dict] = None, - initialize_with_defaults: bool = False, + components: dict | None = None, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 recursion_depth: int = 0, - strict_version: bool = True, + strict_version: bool = True, # noqa: FBT002, FBT001 ) -> Any: """Recursive function to deserialize a scikit-learn flow. @@ -346,10 +353,10 @@ def _deserialize_sklearn( ------- mixed """ - logger.info( - "-%s flow_to_sklearn START o=%s, components=%s, init_defaults=%s" - % ("-" * recursion_depth, o, components, initialize_with_defaults) + "-{} flow_to_sklearn START o={}, components={}, init_defaults={}".format( + "-" * recursion_depth, o, components, initialize_with_defaults + ), ) depth_pp = recursion_depth + 1 # shortcut var, depth plus plus @@ -359,10 +366,8 @@ def _deserialize_sklearn( # the parameter values to the correct type. 
if isinstance(o, str): - try: + with contextlib.suppress(JSONDecodeError): o = json.loads(o) - except JSONDecodeError: - pass if isinstance(o, dict): # Check if the dict encodes a 'special' object, which could not @@ -382,7 +387,9 @@ def _deserialize_sklearn( pass elif serialized_type == COMPONENT_REFERENCE: value = self._deserialize_sklearn( - value, recursion_depth=depth_pp, strict_version=strict_version + value, + recursion_depth=depth_pp, + strict_version=strict_version, ) else: raise NotImplementedError(serialized_type) @@ -407,7 +414,9 @@ def _deserialize_sklearn( rval = (step_name, component, value["argument_1"]) elif serialized_type == "cv_object": rval = self._deserialize_cross_validator( - value, recursion_depth=recursion_depth, strict_version=strict_version + value, + recursion_depth=recursion_depth, + strict_version=strict_version, ) else: raise ValueError("Cannot flow_to_sklearn %s" % serialized_type) @@ -458,10 +467,12 @@ def _deserialize_sklearn( ) else: raise TypeError(o) - logger.info("-%s flow_to_sklearn END o=%s, rval=%s" % ("-" * recursion_depth, o, rval)) + logger.info( + "-{} flow_to_sklearn END o={}, rval={}".format("-" * recursion_depth, o, rval) + ) return rval - def model_to_flow(self, model: Any) -> "OpenMLFlow": + def model_to_flow(self, model: Any) -> OpenMLFlow: """Transform a scikit-learn model to a flow for uploading it to OpenML. 
Parameters @@ -475,7 +486,7 @@ def model_to_flow(self, model: Any) -> "OpenMLFlow": # Necessary to make pypy not complain about all the different possible return types return self._serialize_sklearn(model) - def _serialize_sklearn(self, o: Any, parent_model: Optional[Any] = None) -> Any: + def _serialize_sklearn(self, o: Any, parent_model: Any | None = None) -> Any: # noqa: PLR0912, C901 rval = None # type: Any # TODO: assert that only on first recursion lvl `parent_model` can be None @@ -502,19 +513,17 @@ def _serialize_sklearn(self, o: Any, parent_model: Optional[Any] = None) -> Any: elif isinstance(o, dict): # TODO: explain what type of parameter is here if not isinstance(o, OrderedDict): - o = OrderedDict([(key, value) for key, value in sorted(o.items())]) + o = OrderedDict(sorted(o.items())) rval = OrderedDict() for key, value in o.items(): if not isinstance(key, str): raise TypeError( "Can only use string as keys, you passed " - "type %s for value %s." % (type(key), str(key)) + f"type {type(key)} for value {key!s}.", ) - key = self._serialize_sklearn(key, parent_model) - value = self._serialize_sklearn(value, parent_model) - rval[key] = value - rval = rval + _key = self._serialize_sklearn(key, parent_model) + rval[_key] = self._serialize_sklearn(value, parent_model) elif isinstance(o, type): # TODO: explain what type of parameter is here rval = self._serialize_type(o) @@ -534,7 +543,7 @@ def _serialize_sklearn(self, o: Any, parent_model: Optional[Any] = None) -> Any: return rval - def get_version_information(self) -> List[str]: + def get_version_information(self) -> list[str]: """List versions of libraries required by the flow. Libraries listed are ``Python``, ``scikit-learn``, ``numpy`` and ``scipy``. @@ -543,22 +552,21 @@ def get_version_information(self) -> List[str]: ------- List """ - # This can possibly be done by a package such as pyxb, but I could not get # it to work properly. 
- import sklearn - import scipy import numpy + import scipy + import sklearn major, minor, micro, _, _ = sys.version_info python_version = "Python_{}.".format(".".join([str(major), str(minor), str(micro)])) - sklearn_version = "Sklearn_{}.".format(sklearn.__version__) - numpy_version = "NumPy_{}.".format(numpy.__version__) # type: ignore - scipy_version = "SciPy_{}.".format(scipy.__version__) + sklearn_version = f"Sklearn_{sklearn.__version__}." + numpy_version = f"NumPy_{numpy.__version__}." # type: ignore + scipy_version = f"SciPy_{scipy.__version__}." return [python_version, sklearn_version, numpy_version, scipy_version] - def create_setup_string(self, model: Any) -> str: + def create_setup_string(self, model: Any) -> str: # noqa: ARG002 """Create a string which can be used to reinstantiate the given model. Parameters @@ -569,8 +577,7 @@ def create_setup_string(self, model: Any) -> str: ------- str """ - run_environment = " ".join(self.get_version_information()) - return run_environment + return " ".join(self.get_version_information()) def _is_cross_validator(self, o: Any) -> bool: return isinstance(o, sklearn.model_selection.BaseCrossValidator) @@ -584,7 +591,7 @@ def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool: return sklearn_dependency or sklearn_as_external def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str: - """Fetches the sklearn function docstring for the flow description + r"""Fetches the sklearn function docstring for the flow description Retrieves the sklearn docstring available and does the following: * If length of docstring <= char_lim, then returns the complete docstring @@ -618,14 +625,13 @@ def match_format(s): s = s[:index] # trimming docstring to be within char_lim if len(s) > char_lim: - s = "{}...".format(s[: char_lim - 3]) + s = f"{s[: char_lim - 3]}..." return s.strip() except ValueError: logger.warning( "'Read more' not found in descriptions. " - "Trying to trim till 'Parameters' if available in docstring." 
+ "Trying to trim till 'Parameters' if available in docstring.", ) - pass try: # if 'Read more' doesn't exist, trim till 'Parameters' pattern = "Parameters" @@ -637,10 +643,10 @@ def match_format(s): s = s[:index] # trimming docstring to be within char_lim if len(s) > char_lim: - s = "{}...".format(s[: char_lim - 3]) + s = f"{s[: char_lim - 3]}..." return s.strip() - def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]: + def _extract_sklearn_parameter_docstring(self, model) -> None | str: """Extracts the part of sklearn docstring containing parameter information Fetches the entire docstring and trims just the Parameter section. @@ -678,7 +684,7 @@ def match_format(s): index2 = s.index(match_format(h)) break except ValueError: - logger.warning("{} not available in docstring".format(h)) + logger.warning(f"{h} not available in docstring") continue else: # in the case only 'Parameters' exist, trim till end of docstring @@ -686,7 +692,7 @@ def match_format(s): s = s[index1:index2] return s.strip() - def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]: + def _extract_sklearn_param_info(self, model, char_lim=1024) -> None | dict: """Parses parameter type and description from sklearn dosctring Parameters @@ -715,7 +721,7 @@ def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict] # collecting parameters and their descriptions description = [] # type: List - for i, s in enumerate(lines): + for s in lines: param = p.findall(s) if param != []: # a parameter definition is found by regex @@ -724,19 +730,18 @@ def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict] # till another parameter is found and a new placeholder is created placeholder = [""] # type: List[str] description.append(placeholder) - else: - if len(description) > 0: # description=[] means no parameters found yet - # appending strings to the placeholder created when parameter found - description[-1].append(s) + 
elif len(description) > 0: # description=[] means no parameters found yet + # appending strings to the placeholder created when parameter found + description[-1].append(s) for i in range(len(description)): # concatenating parameter description strings description[i] = "\n".join(description[i]).strip() # limiting all parameter descriptions to accepted OpenML string length if len(description[i]) > char_lim: - description[i] = "{}...".format(description[i][: char_lim - 3]) + description[i] = f"{description[i][: char_lim - 3]}..." # collecting parameters and their types - parameter_docs = OrderedDict() # type: Dict + parameter_docs = OrderedDict() matches = p.findall(docstring) for i, param in enumerate(matches): key, value = str(param).split(":") @@ -765,7 +770,6 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: OpenMLFlow """ - # Get all necessary information about the model objects itself ( parameters, @@ -786,25 +790,24 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: # will be part of the name (in brackets) sub_components_names = "" for key in subcomponents: - if isinstance(subcomponents[key], OpenMLFlow): - name = subcomponents[key].name + name_thing = subcomponents[key] + if isinstance(name_thing, OpenMLFlow): + name = name_thing.name elif ( - isinstance(subcomponents[key], str) + isinstance(name_thing, str) and subcomponents[key] in SKLEARN_PIPELINE_STRING_COMPONENTS ): - name = subcomponents[key] + name = name_thing else: raise TypeError(type(subcomponents[key])) + if key in subcomponents_explicit: sub_components_names += "," + key + "=" + name else: sub_components_names += "," + name - if sub_components_names: - # slice operation on string in order to get rid of leading comma - name = "%s(%s)" % (class_name, sub_components_names[1:]) - else: - name = class_name + # slice operation on string in order to get rid of leading comma + name = f"{class_name}({sub_components_names[1:]})" if sub_components_names else class_name short_name = 
SklearnExtension.trim_flow_name(name) # Get the external versions of all sub-components @@ -813,7 +816,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: tags = self._get_tags() sklearn_description = self._get_sklearn_description(model) - flow = OpenMLFlow( + return OpenMLFlow( name=name, class_name=class_name, custom_name=short_name, @@ -829,14 +832,11 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: dependencies=dependencies, ) - return flow - def _get_dependencies(self) -> str: - dependencies = self._min_dependency_str(sklearn.__version__) - return dependencies + return self._min_dependency_str(sklearn.__version__) # type: ignore - def _get_tags(self) -> List[str]: - sklearn_version = self._format_external_version("sklearn", sklearn.__version__) + def _get_tags(self) -> list[str]: + sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore sklearn_version_formatted = sklearn_version.replace("==", "_") return [ "openml-python", @@ -853,7 +853,7 @@ def _get_tags(self) -> List[str]: def _get_external_version_string( self, model: Any, - sub_components: Dict[str, OpenMLFlow], + sub_components: dict[str, OpenMLFlow], ) -> str: # Create external version string for a flow, given the model and the # already parsed dictionary of sub_components. 
Retrieves the external @@ -875,7 +875,7 @@ def _get_external_version_string( external_versions.add(external_version) openml_version = self._format_external_version("openml", openml.__version__) - sklearn_version = self._format_external_version("sklearn", sklearn.__version__) + sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore external_versions.add(openml_version) external_versions.add(sklearn_version) for visitee in sub_components.values(): @@ -883,16 +883,16 @@ def _get_external_version_string( continue for external_version in visitee.external_version.split(","): external_versions.add(external_version) - return ",".join(list(sorted(external_versions))) + return ",".join(sorted(external_versions)) def _check_multiple_occurence_of_component_in_flow( self, model: Any, - sub_components: Dict[str, OpenMLFlow], + sub_components: dict[str, OpenMLFlow], ) -> None: - to_visit_stack = [] # type: List[OpenMLFlow] + to_visit_stack: list[OpenMLFlow] = [] to_visit_stack.extend(sub_components.values()) - known_sub_components = set() # type: Set[str] + known_sub_components: set[str] = set() while len(to_visit_stack) > 0: visitee = to_visit_stack.pop() @@ -900,21 +900,21 @@ def _check_multiple_occurence_of_component_in_flow( known_sub_components.add(visitee) elif visitee.name in known_sub_components: raise ValueError( - "Found a second occurence of component %s when " - "trying to serialize %s." 
% (visitee.name, model) + f"Found a second occurence of component {visitee.name} when " + f"trying to serialize {model}.", ) else: known_sub_components.add(visitee.name) to_visit_stack.extend(visitee.components.values()) - def _extract_information_from_model( + def _extract_information_from_model( # noqa: PLR0915, C901, PLR0912 self, model: Any, - ) -> Tuple[ - "OrderedDict[str, Optional[str]]", - "OrderedDict[str, Optional[Dict]]", - "OrderedDict[str, OpenMLFlow]", - Set, + ) -> tuple[ + OrderedDict[str, str | None], + OrderedDict[str, dict | None], + OrderedDict[str, OpenMLFlow], + set, ]: # This function contains four "global" states and is quite long and # complicated. If it gets to complicated to ensure it's correctness, @@ -926,8 +926,8 @@ def _extract_information_from_model( sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow] # stores the keys of all subcomponents that should become sub_components_explicit = set() - parameters = OrderedDict() # type: OrderedDict[str, Optional[str]] - parameters_meta_info = OrderedDict() # type: OrderedDict[str, Optional[Dict]] + parameters: OrderedDict[str, str | None] = OrderedDict() + parameters_meta_info: OrderedDict[str, dict | None] = OrderedDict() parameters_docs = self._extract_sklearn_param_info(model) model_parameters = model.get_params(deep=False) @@ -951,18 +951,16 @@ def flatten_all(list_): isinstance(rval, (list, tuple)) and len(rval) > 0 and isinstance(rval[0], (list, tuple)) - and all([isinstance(rval_i, type(rval[0])) for rval_i in rval]) + and all(isinstance(rval_i, type(rval[0])) for rval_i in rval) ) # Check that all list elements are of simple types. 
nested_list_of_simple_types = ( is_non_empty_list_of_lists_with_same_type - and all([isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval)]) + and all(isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval)) and all( - [ - len(rv) in (2, 3) and rv[1] not in SKLEARN_PIPELINE_STRING_COMPONENTS - for rv in rval - ] + len(rv) in (2, 3) and rv[1] not in SKLEARN_PIPELINE_STRING_COMPONENTS + for rv in rval ) ) @@ -970,10 +968,10 @@ def flatten_all(list_): # If a list of lists is identified that include 'non-simple' types (e.g. objects), # we assume they are steps in a pipeline, feature union, or base classifiers in # a voting classifier. - parameter_value = list() # type: List + parameter_value = [] # type: List reserved_keywords = set(model.get_params(deep=False).keys()) - for i, sub_component_tuple in enumerate(rval): + for sub_component_tuple in rval: identifier = sub_component_tuple[0] sub_component = sub_component_tuple[1] sub_component_type = type(sub_component_tuple) @@ -982,7 +980,7 @@ def flatten_all(list_): # Pipeline.steps, FeatureUnion.transformer_list} # length 3 is for ColumnTransformer msg = "Length of tuple of type {} does not match assumptions".format( - sub_component_type + sub_component_type, ) raise ValueError(msg) @@ -994,9 +992,7 @@ def flatten_all(list_): "got %s" % sub_component ) raise ValueError(msg) - else: - pass - elif isinstance(sub_component, type(None)): + elif sub_component is None: msg = ( "Cannot serialize objects of None type. Please use a valid " "placeholder for None. 
Note that empty sklearn estimators can be " @@ -1011,8 +1007,8 @@ def flatten_all(list_): raise TypeError(msg) if identifier in reserved_keywords: - parent_model = "{}.{}".format(model.__module__, model.__class__.__name__) - msg = "Found element shadowing official " "parameter for %s: %s" % ( + parent_model = f"{model.__module__}.{model.__class__.__name__}" + msg = "Found element shadowing official " "parameter for {}: {}".format( parent_model, identifier, ) @@ -1038,11 +1034,11 @@ def flatten_all(list_): dependencies=dependencies, model=None, ) - component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]] + component_reference: OrderedDict[str, str | dict] = OrderedDict() component_reference[ "oml-python:serialized_object" ] = COMPOSITION_STEP_CONSTANT - cr_value = OrderedDict() # type: Dict[str, Any] + cr_value: dict[str, Any] = OrderedDict() cr_value["key"] = identifier cr_value["step_name"] = identifier if len(sub_component_tuple) == 3: @@ -1084,27 +1080,27 @@ def flatten_all(list_): cr = self._serialize_sklearn(component_reference, model) parameters[k] = json.dumps(cr) + elif not (hasattr(rval, "__len__") and len(rval) == 0): + rval = json.dumps(rval) + parameters[k] = rval + # a regular hyperparameter else: - # a regular hyperparameter - if not (hasattr(rval, "__len__") and len(rval) == 0): - rval = json.dumps(rval) - parameters[k] = rval - else: - parameters[k] = None + parameters[k] = None if parameters_docs is not None: data_type, description = parameters_docs[k] parameters_meta_info[k] = OrderedDict( - (("description", description), ("data_type", data_type)) + (("description", description), ("data_type", data_type)), ) else: parameters_meta_info[k] = OrderedDict((("description", None), ("data_type", None))) return parameters, parameters_meta_info, sub_components, sub_components_explicit - def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> Tuple[Dict, Set]: + def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> 
tuple[dict, set]: """ - Returns: + Returns + ------- i) a dict with all parameter names that have a default value, and ii) a set with all parameter names that do not have a default @@ -1123,8 +1119,8 @@ def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> Tuple[Dict, Set] # parameters with defaults are optional, all others are required. parameters = inspect.signature(fn_name).parameters required_params = set() - optional_params = dict() - for param in parameters.keys(): + optional_params = {} + for param in parameters: parameter = parameters.get(param) default_val = parameter.default # type: ignore if default_val is inspect.Signature.empty: @@ -1136,17 +1132,17 @@ def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> Tuple[Dict, Set] def _deserialize_model( self, flow: OpenMLFlow, - keep_defaults: bool, + keep_defaults: bool, # noqa: FBT001 recursion_depth: int, - strict_version: bool = True, + strict_version: bool = True, # noqa: FBT002, FBT001 ) -> Any: - logger.info("-%s deserialize %s" % ("-" * recursion_depth, flow.name)) + logger.info("-{} deserialize {}".format("-" * recursion_depth, flow.name)) model_name = flow.class_name self._check_dependencies(flow.dependencies, strict_version=strict_version) parameters = flow.parameters components = flow.components - parameter_dict = OrderedDict() # type: Dict[str, Any] + parameter_dict: dict[str, Any] = OrderedDict() # Do a shallow copy of the components dictionary so we can remove the # components from this copy once we added them into the pipeline. 
This @@ -1157,7 +1153,9 @@ def _deserialize_model( for name in parameters: value = parameters.get(name) - logger.info("--%s flow_parameter=%s, value=%s" % ("-" * recursion_depth, name, value)) + logger.info( + "--{} flow_parameter={}, value={}".format("-" * recursion_depth, name, value) + ) rval = self._deserialize_sklearn( value, components=components_, @@ -1173,36 +1171,46 @@ def _deserialize_model( if name not in components_: continue value = components[name] - logger.info("--%s flow_component=%s, value=%s" % ("-" * recursion_depth, name, value)) + logger.info( + "--{} flow_component={}, value={}".format("-" * recursion_depth, name, value) + ) rval = self._deserialize_sklearn( - value, recursion_depth=recursion_depth + 1, strict_version=strict_version + value, + recursion_depth=recursion_depth + 1, + strict_version=strict_version, ) parameter_dict[name] = rval if model_name is None and flow.name in SKLEARN_PIPELINE_STRING_COMPONENTS: return flow.name - else: - module_name = model_name.rsplit(".", 1) - model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - - if keep_defaults: - # obtain all params with a default - param_defaults, _ = self._get_fn_arguments_with_defaults(model_class.__init__) - - # delete the params that have a default from the dict, - # so they get initialized with their default value - # except [...] - for param in param_defaults: - # [...] the ones that also have a key in the components dict. 
- # As OpenML stores different flows for ensembles with different - # (base-)components, in OpenML terms, these are not considered - # hyperparameters but rather constants (i.e., changing them would - # result in a different flow) - if param not in components.keys(): - del parameter_dict[param] - return model_class(**parameter_dict) - - def _check_dependencies(self, dependencies: str, strict_version: bool = True) -> None: + + assert model_name is not None + module_name = model_name.rsplit(".", 1) + model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) + + if keep_defaults: + # obtain all params with a default + param_defaults, _ = self._get_fn_arguments_with_defaults(model_class.__init__) + + # delete the params that have a default from the dict, + # so they get initialized with their default value + # except [...] + for param in param_defaults: + # [...] the ones that also have a key in the components dict. + # As OpenML stores different flows for ensembles with different + # (base-)components, in OpenML terms, these are not considered + # hyperparameters but rather constants (i.e., changing them would + # result in a different flow) + if param not in components: + del parameter_dict[param] + + return model_class(**parameter_dict) + + def _check_dependencies( + self, + dependencies: str, + strict_version: bool = True, # noqa: FBT001, FBT002 + ) -> None: if not dependencies: return @@ -1232,15 +1240,15 @@ def _check_dependencies(self, dependencies: str, strict_version: bool = True) -> raise NotImplementedError("operation '%s' is not supported" % operation) message = ( "Trying to deserialize a model with dependency " - "%s not satisfied." % dependency_string + f"{dependency_string} not satisfied." 
) if not check: if strict_version: raise ValueError(message) - else: - warnings.warn(message) - def _serialize_type(self, o: Any) -> "OrderedDict[str, str]": + warnings.warn(message, category=UserWarning, stacklevel=2) + + def _serialize_type(self, o: Any) -> OrderedDict[str, str]: mapping = { float: "float", np.float32: "np.float32", @@ -1250,8 +1258,8 @@ def _serialize_type(self, o: Any) -> "OrderedDict[str, str]": np.int64: "np.int64", } if LooseVersion(np.__version__) < "1.24": - mapping[np.float] = "np.float" - mapping[np.int] = "np.int" + mapping[float] = "np.float" + mapping[int] = "np.int" ret = OrderedDict() # type: 'OrderedDict[str, str]' ret["oml-python:serialized_object"] = "type" @@ -1267,26 +1275,28 @@ def _deserialize_type(self, o: str) -> Any: "np.int32": np.int32, "np.int64": np.int64, } + + # TODO(eddiebergman): Might be able to remove this if LooseVersion(np.__version__) < "1.24": - mapping["np.float"] = np.float - mapping["np.int"] = np.int + mapping["np.float"] = np.float # type: ignore # noqa: NPY001 + mapping["np.int"] = np.int # type: ignore # noqa: NPY001 return mapping[o] - def _serialize_rv_frozen(self, o: Any) -> "OrderedDict[str, Union[str, Dict]]": + def _serialize_rv_frozen(self, o: Any) -> OrderedDict[str, str | dict]: args = o.args kwds = o.kwds a = o.a b = o.b dist = o.dist.__class__.__module__ + "." 
+ o.dist.__class__.__name__ - ret = OrderedDict() # type: 'OrderedDict[str, Union[str, Dict]]' + ret: OrderedDict[str, str | dict] = OrderedDict() ret["oml-python:serialized_object"] = "rv_frozen" ret["value"] = OrderedDict( - (("dist", dist), ("a", a), ("b", b), ("args", args), ("kwds", kwds)) + (("dist", dist), ("a", a), ("b", b), ("args", args), ("kwds", kwds)), ) return ret - def _deserialize_rv_frozen(self, o: "OrderedDict[str, str]") -> Any: + def _deserialize_rv_frozen(self, o: OrderedDict[str, str]) -> Any: args = o["args"] kwds = o["kwds"] a = o["a"] @@ -1296,17 +1306,23 @@ def _deserialize_rv_frozen(self, o: "OrderedDict[str, str]") -> Any: module_name = dist_name.rsplit(".", 1) try: rv_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - except AttributeError: - warnings.warn("Cannot create model %s for flow." % dist_name) + except AttributeError as e: + _tb = traceback.format_exc() + warnings.warn( + f"Cannot create model {dist_name} for flow. Reason is from error {type(e)}:{e}" + f"\nTraceback: {_tb}", + RuntimeWarning, + stacklevel=2, + ) return None - dist = scipy.stats.distributions.rv_frozen(rv_class(), *args, **kwds) + dist = scipy.stats.distributions.rv_frozen(rv_class(), *args, **kwds) # type: ignore dist.a = a dist.b = b return dist - def _serialize_function(self, o: Callable) -> "OrderedDict[str, str]": + def _serialize_function(self, o: Callable) -> OrderedDict[str, str]: name = o.__module__ + "." 
+ o.__name__ ret = OrderedDict() # type: 'OrderedDict[str, str]' ret["oml-python:serialized_object"] = "function" @@ -1315,11 +1331,10 @@ def _serialize_function(self, o: Callable) -> "OrderedDict[str, str]": def _deserialize_function(self, name: str) -> Callable: module_name = name.rsplit(".", 1) - function_handle = getattr(importlib.import_module(module_name[0]), module_name[1]) - return function_handle + return getattr(importlib.import_module(module_name[0]), module_name[1]) - def _serialize_cross_validator(self, o: Any) -> "OrderedDict[str, Union[str, Dict]]": - ret = OrderedDict() # type: 'OrderedDict[str, Union[str, Dict]]' + def _serialize_cross_validator(self, o: Any) -> OrderedDict[str, str | dict]: + ret: OrderedDict[str, str | dict] = OrderedDict() parameters = OrderedDict() # type: 'OrderedDict[str, Any]' @@ -1327,7 +1342,7 @@ def _serialize_cross_validator(self, o: Any) -> "OrderedDict[str, Union[str, Dic cls = o.__class__ init = getattr(cls.__init__, "deprecated_original", cls.__init__) # Ignore varargs, kw and default values and pop self - init_signature = inspect.signature(init) + init_signature = inspect.signature(init) # type: ignore # Consider the constructor parameters excluding 'self' if init is object.__init__: args = [] # type: List @@ -1337,7 +1352,7 @@ def _serialize_cross_validator(self, o: Any) -> "OrderedDict[str, Union[str, Dic p.name for p in init_signature.parameters.values() if p.name != "self" and p.kind != p.VAR_KEYWORD - ] + ], ) for key in args: @@ -1366,7 +1381,10 @@ def _serialize_cross_validator(self, o: Any) -> "OrderedDict[str, Union[str, Dic return ret def _deserialize_cross_validator( - self, value: "OrderedDict[str, Any]", recursion_depth: int, strict_version: bool = True + self, + value: OrderedDict[str, Any], + recursion_depth: int, + strict_version: bool = True, # noqa: FBT002, FBT001 ) -> Any: model_name = value["name"] parameters = value["parameters"] @@ -1386,12 +1404,13 @@ def _format_external_version( 
model_package_name: str, model_package_version_number: str, ) -> str: - return "%s==%s" % (model_package_name, model_package_version_number) + return f"{model_package_name}=={model_package_version_number}" @staticmethod def _get_parameter_values_recursive( - param_grid: Union[Dict, List[Dict]], parameter_name: str - ) -> List[Any]: + param_grid: dict | list[dict], + parameter_name: str, + ) -> list[Any]: """ Returns a list of values for a given hyperparameter, encountered recursively throughout the flow. (e.g., n_jobs can be defined @@ -1412,28 +1431,28 @@ def _get_parameter_values_recursive( A list of all values of hyperparameters with this name """ if isinstance(param_grid, dict): - result = list() - for param, value in param_grid.items(): - # n_jobs is scikit-learn parameter for parallelizing jobs - if param.split("__")[-1] == parameter_name: - result.append(value) - return result - elif isinstance(param_grid, list): - result = list() + return [ + value + for param, value in param_grid.items() + if param.split("__")[-1] == parameter_name + ] + + if isinstance(param_grid, list): + result = [] for sub_grid in param_grid: result.extend( - SklearnExtension._get_parameter_values_recursive(sub_grid, parameter_name) + SklearnExtension._get_parameter_values_recursive(sub_grid, parameter_name), ) return result - else: - raise ValueError("Param_grid should either be a dict or list of dicts") + + raise ValueError("Param_grid should either be a dict or list of dicts") def _prevent_optimize_n_jobs(self, model): """ Ensures that HPO classes will not optimize the n_jobs hyperparameter - Parameters: - ----------- + Parameters + ---------- model: The model that will be fitted """ @@ -1450,19 +1469,20 @@ def _prevent_optimize_n_jobs(self, model): "Using subclass BaseSearchCV other than " "{GridSearchCV, RandomizedSearchCV}. " "Could not find attribute " - "param_distributions." + "param_distributions.", ) logger.warning( "Warning! 
Using subclass BaseSearchCV other than " "{GridSearchCV, RandomizedSearchCV}. " - "Should implement param check. " + "Should implement param check. ", ) n_jobs_vals = SklearnExtension._get_parameter_values_recursive( - param_distributions, "n_jobs" + param_distributions, + "n_jobs", ) if len(n_jobs_vals) > 0: raise PyOpenMLError( - "openml-python should not be used to " "optimize the n_jobs parameter." + "openml-python should not be used to " "optimize the n_jobs parameter.", ) ################################################################################################ @@ -1485,7 +1505,7 @@ def is_estimator(self, model: Any) -> bool: o = model return hasattr(o, "fit") and hasattr(o, "get_params") and hasattr(o, "set_params") - def seed_model(self, model: Any, seed: Optional[int] = None) -> Any: + def seed_model(self, model: Any, seed: int | None = None) -> Any: # noqa: C901 """Set the random state of all the unseeded components of a model and return the seeded model. @@ -1511,17 +1531,19 @@ def seed_model(self, model: Any, seed: Optional[int] = None) -> Any: def _seed_current_object(current_value): if isinstance(current_value, int): # acceptable behaviour return False - elif isinstance(current_value, np.random.RandomState): + + if isinstance(current_value, np.random.RandomState): raise ValueError( "Models initialized with a RandomState object are not " - "supported. Please seed with an integer. " + "supported. Please seed with an integer. ", ) - elif current_value is not None: + + if current_value is not None: raise ValueError( - "Models should be seeded with int or None (this should never " "happen). " + "Models should be seeded with int or None (this should never " "happen). 
", ) - else: - return True + + return True rs = np.random.RandomState(seed) model_params = model.get_params() @@ -1561,12 +1583,15 @@ def check_if_model_fitted(self, model: Any) -> bool: ------- bool """ + from sklearn.exceptions import NotFittedError + from sklearn.utils.validation import check_is_fitted + try: # check if model is fitted - from sklearn.exceptions import NotFittedError + check_is_fitted(model) # Creating random dummy data of arbitrary size - dummy_data = np.random.uniform(size=(10, 3)) + dummy_data = np.random.uniform(size=(10, 3)) # noqa: NPY002 # Using 'predict' instead of 'sklearn.utils.validation.check_is_fitted' for a more # robust check that works across sklearn versions and models. Internally, 'predict' # should call 'check_is_fitted' for every concerned attribute, thus offering a more @@ -1581,17 +1606,20 @@ def check_if_model_fitted(self, model: Any) -> bool: # Will reach here if the model was fit on a dataset with more or less than 3 features return True - def _run_model_on_fold( + def _run_model_on_fold( # noqa: PLR0915, PLR0913, C901, PLR0912 self, model: Any, - task: "OpenMLTask", - X_train: Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame], + task: OpenMLTask, + X_train: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame, rep_no: int, fold_no: int, - y_train: Optional[np.ndarray] = None, - X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None, - ) -> Tuple[ - np.ndarray, Optional[pd.DataFrame], "OrderedDict[str, float]", Optional[OpenMLRunTrace] + y_train: np.ndarray | None = None, + X_test: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame | None = None, + ) -> tuple[ + np.ndarray, + pd.DataFrame | None, + OrderedDict[str, float], + OpenMLRunTrace | None, ]: """Run a model on a repeat,fold,subsample triplet of the task and return prediction information. 
@@ -1640,7 +1668,9 @@ def _run_model_on_fold( """ def _prediction_to_probabilities( - y: Union[np.ndarray, List], model_classes: List[Any], class_labels: Optional[List[str]] + y: np.ndarray | list, + model_classes: list[Any], + class_labels: list[str] | None, ) -> pd.DataFrame: """Transforms predicted probabilities to match with OpenML class indices. @@ -1673,7 +1703,10 @@ def _prediction_to_probabilities( # DataFrame allows more accurate mapping of classes as column names result = pd.DataFrame( - 0, index=np.arange(len(y)), columns=model_classes, dtype=np.float32 + 0, + index=np.arange(len(y)), + columns=model_classes, + dtype=np.float32, ) for obs, prediction in enumerate(y): result.loc[obs, prediction] = 1.0 @@ -1696,20 +1729,20 @@ def _prediction_to_probabilities( modelfit_start_walltime = time.time() if isinstance(task, OpenMLSupervisedTask): - model_copy.fit(X_train, y_train) + model_copy.fit(X_train, y_train) # type: ignore elif isinstance(task, OpenMLClusteringTask): - model_copy.fit(X_train) + model_copy.fit(X_train) # type: ignore modelfit_dur_cputime = (time.process_time() - modelfit_start_cputime) * 1000 modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000 user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime - refit_time = model_copy.refit_time_ * 1000 if hasattr(model_copy, "refit_time_") else 0 + refit_time = model_copy.refit_time_ * 1000 if hasattr(model_copy, "refit_time_") else 0 # type: ignore user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime except AttributeError as e: # typically happens when training a regressor on classification task - raise PyOpenMLError(str(e)) + raise PyOpenMLError(str(e)) from e if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): # search for model classes_ (might differ depending on modeltype) @@ -1732,7 +1765,8 @@ def _prediction_to_probabilities( # to handle the case when dataset is numpy and categories are encoded # however 
the class labels stored in task are still categories if isinstance(y_train, np.ndarray) and isinstance( - cast(List, task.class_labels)[0], str + cast(List, task.class_labels)[0], + str, ): model_classes = [cast(List[str], task.class_labels)[i] for i in model_classes] @@ -1782,10 +1816,10 @@ def _prediction_to_probabilities( proba_y.shape[1], len(task.class_labels), ) - warnings.warn(message) + warnings.warn(message, stacklevel=2) openml.config.logger.warning(message) - for i, col in enumerate(task.class_labels): + for _i, col in enumerate(task.class_labels): # adding missing columns with 0 probability if col not in model_classes: proba_y[col] = 0 @@ -1798,30 +1832,27 @@ def _prediction_to_probabilities( missing_cols = list(set(task.class_labels) - set(proba_y.columns)) raise ValueError("Predicted probabilities missing for the columns: ", missing_cols) - elif isinstance(task, OpenMLRegressionTask): + elif isinstance(task, (OpenMLRegressionTask, OpenMLClusteringTask)): proba_y = None - - elif isinstance(task, OpenMLClusteringTask): - proba_y = None - else: raise TypeError(type(task)) if self._is_hpo_class(model_copy): trace_data = self._extract_trace_data(model_copy, rep_no, fold_no) - trace = self._obtain_arff_trace( - model_copy, trace_data - ) # type: Optional[OpenMLRunTrace] # noqa E501 + trace: OpenMLRunTrace | None = self._obtain_arff_trace( + model_copy, + trace_data, + ) else: trace = None return pred_y, proba_y, user_defined_measures, trace - def obtain_parameter_values( + def obtain_parameter_values( # noqa: C901, PLR0915 self, - flow: "OpenMLFlow", + flow: OpenMLFlow, model: Any = None, - ) -> List[Dict[str, Any]]: + ) -> list[dict[str, Any]]: """Extracts all parameter settings required for the flow from the model. 
If no explicit model is provided, the parameters will be extracted from `flow.model` @@ -1852,7 +1883,13 @@ def get_flow_dict(_flow): flow_map.update(get_flow_dict(_flow.components[subflow])) return flow_map - def extract_parameters(_flow, _flow_dict, component_model, _main_call=False, main_id=None): + def extract_parameters( # noqa: PLR0915, PLR0912, C901 + _flow, + _flow_dict, + component_model, + _main_call=False, # noqa: FBT002 + main_id=None, + ): def is_subcomponent_specification(values): # checks whether the current value can be a specification of # subcomponents, as for example the value for steps parameter @@ -1885,7 +1922,7 @@ def is_subcomponent_specification(values): ): model_parameters = set() else: - model_parameters = set([mp for mp in component_model.get_params(deep=False)]) + model_parameters = set(component_model.get_params(deep=False)) if len(exp_parameters.symmetric_difference(model_parameters)) != 0: flow_params = sorted(exp_parameters) model_params = sorted(model_parameters) @@ -1893,7 +1930,7 @@ def is_subcomponent_specification(values): "Parameters of the model do not match the " "parameters expected by the " "flow:\nexpected flow parameters: " - "%s\nmodel parameters: %s" % (flow_params, model_params) + f"{flow_params}\nmodel parameters: {model_params}", ) exp_components = set(_flow.components) if ( @@ -1902,14 +1939,12 @@ def is_subcomponent_specification(values): ): model_components = set() else: - _ = set([mp for mp in component_model.get_params(deep=False)]) - model_components = set( - [ - mp - for mp in component_model.get_params(deep=True) - if "__" not in mp and mp not in _ - ] - ) + _ = set(component_model.get_params(deep=False)) + model_components = { + mp + for mp in component_model.get_params(deep=True) + if "__" not in mp and mp not in _ + } if len(exp_components.symmetric_difference(model_components)) != 0: is_problem = True if len(exp_components - model_components) > 0: @@ -1931,7 +1966,7 @@ def 
is_subcomponent_specification(values): "Subcomponents of the model do not match the " "parameters expected by the " "flow:\nexpected flow subcomponents: " - "%s\nmodel subcomponents: %s" % (flow_components, model_components) + f"{flow_components}\nmodel subcomponents: {model_components}", ) _params = [] @@ -1949,7 +1984,7 @@ def is_subcomponent_specification(values): if is_subcomponent_specification(current_param_values): # complex parameter value, with subcomponents - parsed_values = list() + parsed_values = [] for subcomponent in current_param_values: # scikit-learn stores usually tuples in the form # (name (str), subcomponent (mixed), argument @@ -1963,7 +1998,7 @@ def is_subcomponent_specification(values): if not isinstance(subcomponent_identifier, str): raise TypeError( "Subcomponent identifier should be of type string, " - "but is {}".format(type(subcomponent_identifier)) + f"but is {type(subcomponent_identifier)}", ) if not isinstance(subcomponent_flow, (openml.flows.OpenMLFlow, str)): if ( @@ -1974,8 +2009,8 @@ def is_subcomponent_specification(values): else: raise TypeError( "Subcomponent flow should be of type flow, but is {}".format( - type(subcomponent_flow) - ) + type(subcomponent_flow), + ), ) current = { @@ -1987,10 +2022,11 @@ def is_subcomponent_specification(values): } if len(subcomponent) == 3: if not isinstance(subcomponent[2], list) and not isinstance( - subcomponent[2], OrderedDict + subcomponent[2], + OrderedDict, ): raise TypeError( - "Subcomponent argument should be list or OrderedDict" + "Subcomponent argument should be list or OrderedDict", ) current["value"]["argument_1"] = subcomponent[2] parsed_values.append(current) @@ -2010,16 +2046,16 @@ def is_subcomponent_specification(values): subcomponent_model = component_model.get_params()[_identifier] _params.extend( extract_parameters( - _flow.components[_identifier], _flow_dict, subcomponent_model - ) + _flow.components[_identifier], + _flow_dict, + subcomponent_model, + ), ) return _params 
flow_dict = get_flow_dict(flow) model = model if model is not None else flow.model - parameters = extract_parameters(flow, flow_dict, model, True, flow.flow_id) - - return parameters + return extract_parameters(flow, flow_dict, model, _main_call=True, main_id=flow.flow_id) def _openml_param_name_to_sklearn( self, @@ -2094,15 +2130,31 @@ def instantiate_model_from_hpo_class( if not self._is_hpo_class(model): raise AssertionError( "Flow model %s is not an instance of sklearn.model_selection._search.BaseSearchCV" - % model + % model, ) base_estimator = model.estimator base_estimator.set_params(**trace_iteration.get_parameters()) return base_estimator def _extract_trace_data(self, model, rep_no, fold_no): + """Extracts data from a machine learning model's cross-validation results + and creates an ARFF (Attribute-Relation File Format) trace. + + Parameters + ---------- + model : Any + A fitted hyperparameter optimization model. + rep_no : int + The repetition number. + fold_no : int + The fold number. + + Returns + ------- + A list of ARFF tracecontent. 
+ """ arff_tracecontent = [] - for itt_no in range(0, len(model.cv_results_["mean_test_score"])): + for itt_no in range(len(model.cv_results_["mean_test_score"])): # we use the string values for True and False, as it is defined in # this way by the OpenML server selected = "false" @@ -2113,10 +2165,7 @@ def _extract_trace_data(self, model, rep_no, fold_no): for key in model.cv_results_: if key.startswith("param_"): value = model.cv_results_[key][itt_no] - if value is not np.ma.masked: - serialized_value = json.dumps(value) - else: - serialized_value = np.nan + serialized_value = json.dumps(value) if value is not np.ma.masked else np.nan arff_line.append(serialized_value) arff_tracecontent.append(arff_line) return arff_tracecontent @@ -2124,8 +2173,8 @@ def _extract_trace_data(self, model, rep_no, fold_no): def _obtain_arff_trace( self, model: Any, - trace_content: List, - ) -> "OpenMLRunTrace": + trace_content: list, + ) -> OpenMLRunTrace: """Create arff trace object from a fitted model and the trace content obtained by repeatedly calling ``run_model_on_task``. 
@@ -2144,7 +2193,7 @@ def _obtain_arff_trace( if not self._is_hpo_class(model): raise AssertionError( "Flow model %s is not an instance of sklearn.model_selection._search.BaseSearchCV" - % model + % model, ) if not hasattr(model, "cv_results_"): raise ValueError("model should contain `cv_results_`") @@ -2171,20 +2220,20 @@ def _obtain_arff_trace( or param_value is np.ma.masked ): # basic string values - type = "STRING" + type = "STRING" # noqa: A001 elif isinstance(param_value, (list, tuple)) and all( isinstance(i, int) for i in param_value ): # list of integers (usually for selecting features) # hyperparameter layer_sizes of MLPClassifier - type = "STRING" + type = "STRING" # noqa: A001 else: raise TypeError("Unsupported param type in param grid: %s" % key) # renamed the attribute param to parameter, as this is a required # OpenML convention - this also guards against name collisions # with the required trace attributes - attribute = (PREFIX + key[6:], type) + attribute = (PREFIX + key[6:], type) # type: ignore trace_attributes.append(attribute) return OpenMLRunTrace.generate( diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py index f8d35c3f5..ce32fec7d 100644 --- a/openml/flows/__init__.py +++ b/openml/flows/__init__.py @@ -1,14 +1,13 @@ # License: BSD 3-Clause from .flow import OpenMLFlow - from .functions import ( - get_flow, - list_flows, - flow_exists, - get_flow_id, assert_flows_equal, delete_flow, + flow_exists, + get_flow, + get_flow_id, + list_flows, ) __all__ = [ diff --git a/openml/flows/flow.py b/openml/flows/flow.py index b9752e77c..4e437e35c 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -1,15 +1,16 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import OrderedDict -import os -from typing import Dict, List, Union, Tuple, Optional # noqa: F401 import logging +from collections import OrderedDict +from pathlib import Path +from typing import Any, Hashable, Sequence import xmltodict from 
openml.base import OpenMLBase -from ..extensions import get_extension_by_flow -from ..utils import extract_xml_tags +from openml.extensions import Extension, get_extension_by_flow +from openml.utils import extract_xml_tags class OpenMLFlow(OpenMLBase): @@ -59,10 +60,10 @@ class OpenMLFlow(OpenMLBase): A list of dependencies necessary to run the flow. This field should contain all libraries the flow depends on. To allow reproducibility it should also specify the exact version numbers. - class_name : str + class_name : str, optional The development language name of the class which is described by this flow. - custom_name : str + custom_name : str, optional Custom name of the flow given by the owner. binary_url : str, optional Url from which the binary can be downloaded. Added by the server. @@ -81,32 +82,34 @@ class OpenMLFlow(OpenMLBase): Date the flow was uploaded. Filled in by the server. flow_id : int, optional Flow ID. Assigned by the server. + extension : Extension, optional + The extension for a flow (e.g., sklearn). version : str, optional OpenML version of the flow. Assigned by the server. 
""" - def __init__( + def __init__( # noqa: PLR0913 self, - name, - description, - model, - components, - parameters, - parameters_meta_info, - external_version, - tags, - language, - dependencies, - class_name=None, - custom_name=None, - binary_url=None, - binary_format=None, - binary_md5=None, - uploader=None, - upload_date=None, - flow_id=None, - extension=None, - version=None, + name: str, + description: str, + model: object, + components: dict, + parameters: dict, + parameters_meta_info: dict, + external_version: str, + tags: list, + language: str, + dependencies: str, + class_name: str | None = None, + custom_name: str | None = None, + binary_url: str | None = None, + binary_format: str | None = None, + binary_md5: str | None = None, + uploader: str | None = None, + upload_date: str | None = None, + flow_id: int | None = None, + extension: Extension | None = None, + version: str | None = None, ): self.name = name self.description = description @@ -117,10 +120,10 @@ def __init__( [parameters, "parameters"], [parameters_meta_info, "parameters_meta_info"], ]: - if not isinstance(variable, OrderedDict): + if not isinstance(variable, (OrderedDict, dict)): raise TypeError( - "%s must be of type OrderedDict, " - "but is %s." % (variable_name, type(variable)) + f"{variable_name} must be of type OrderedDict or dict, " + f"but is {type(variable)}.", ) self.components = components @@ -133,13 +136,14 @@ def __init__( if len(keys_parameters.difference(keys_parameters_meta_info)) > 0: raise ValueError( "Parameter %s only in parameters, but not in " - "parameters_meta_info." % str(keys_parameters.difference(keys_parameters_meta_info)) + "parameters_meta_info." + % str(keys_parameters.difference(keys_parameters_meta_info)), ) if len(keys_parameters_meta_info.difference(keys_parameters)) > 0: raise ValueError( "Parameter %s only in parameters_meta_info, " "but not in parameters." 
- % str(keys_parameters_meta_info.difference(keys_parameters)) + % str(keys_parameters_meta_info.difference(keys_parameters)), ) self.external_version = external_version @@ -161,19 +165,21 @@ def __init__( self._extension = extension @property - def id(self) -> Optional[int]: + def id(self) -> int | None: + """The ID of the flow.""" return self.flow_id @property - def extension(self): + def extension(self) -> Extension: + """The extension of the flow (e.g., sklearn).""" if self._extension is not None: return self._extension - else: - raise RuntimeError( - "No extension could be found for flow {}: {}".format(self.flow_id, self.name) - ) - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + raise RuntimeError( + f"No extension could be found for flow {self.flow_id}: {self.name}", + ) + + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: """Collect all information to display in the __repr__ body.""" fields = { "Flow Name": self.name, @@ -181,10 +187,10 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: "Dependencies": self.dependencies, } if self.flow_id is not None: - fields["Flow URL"] = self.openml_url + fields["Flow URL"] = self.openml_url if self.openml_url is not None else "None" fields["Flow ID"] = str(self.flow_id) if self.version is not None: - fields["Flow ID"] += " (version {})".format(self.version) + fields["Flow ID"] += f" (version {self.version})" if self.upload_date is not None: fields["Upload Date"] = self.upload_date.replace("T", " ") if self.binary_url is not None: @@ -202,18 +208,18 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: ] return [(key, fields[key]) for key in order if key in fields] - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": + def _to_dict(self) -> dict[str, dict]: # noqa: C901, PLR0912 """Creates a dictionary representation of self.""" - flow_container = OrderedDict() # type: 'OrderedDict[str, 
OrderedDict]' + flow_container = OrderedDict() # type: 'dict[str, dict]' flow_dict = OrderedDict( - [("@xmlns:oml", "http://openml.org/openml")] - ) # type: 'OrderedDict[str, Union[List, str]]' # noqa E501 + [("@xmlns:oml", "http://openml.org/openml")], + ) # type: 'dict[str, list | str]' # E501 flow_container["oml:flow"] = flow_dict _add_if_nonempty(flow_dict, "oml:id", self.flow_id) for required in ["name", "external_version"]: if getattr(self, required) is None: - raise ValueError("self.{} is required but None".format(required)) + raise ValueError(f"self.{required} is required but None") for attribute in [ "uploader", "name", @@ -226,7 +232,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": "language", "dependencies", ]: - _add_if_nonempty(flow_dict, "oml:{}".format(attribute), getattr(self, attribute)) + _add_if_nonempty(flow_dict, f"oml:{attribute}", getattr(self, attribute)) if not self.description: logger = logging.getLogger(__name__) @@ -245,15 +251,15 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": for key_, value in param_dict.items(): if key_ is not None and not isinstance(key_, str): raise ValueError( - "Parameter name %s cannot be serialized " - "because it is of type %s. Only strings " - "can be serialized." % (key_, type(key_)) + f"Parameter name {key_} cannot be serialized " + f"because it is of type {type(key_)}. Only strings " + "can be serialized.", ) if value is not None and not isinstance(value, str): raise ValueError( - "Parameter value %s cannot be serialized " - "because it is of type %s. Only strings " - "can be serialized." % (value, type(value)) + f"Parameter value {value} cannot be serialized " + f"because it is of type {type(value)}. 
Only strings " + "can be serialized.", ) flow_parameters.append(param_dict) @@ -262,7 +268,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": components = [] for key in self.components: - component_dict = OrderedDict() # type: 'OrderedDict[str, Dict]' + component_dict = OrderedDict() # type: 'OrderedDict[str, dict]' component_dict["oml:identifier"] = key if self.components[key] in ["passthrough", "drop"]: component_dict["oml:flow"] = { @@ -277,9 +283,9 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": # value is a flow. The flow itself is valid by recursion if key_ is not None and not isinstance(key_, str): raise ValueError( - "Parameter name %s cannot be serialized " - "because it is of type %s. Only strings " - "can be serialized." % (key_, type(key_)) + f"Parameter name {key_} cannot be serialized " + f"because it is of type {type(key_)}. Only strings " + "can be serialized.", ) components.append(component_dict) @@ -287,12 +293,12 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": flow_dict["oml:component"] = components flow_dict["oml:tag"] = self.tags for attribute in ["binary_url", "binary_format", "binary_md5"]: - _add_if_nonempty(flow_dict, "oml:{}".format(attribute), getattr(self, attribute)) + _add_if_nonempty(flow_dict, f"oml:{attribute}", getattr(self, attribute)) return flow_container @classmethod - def _from_dict(cls, xml_dict): + def _from_dict(cls, xml_dict: dict) -> OpenMLFlow: """Create a flow from an xml description. 
Calls itself recursively to create :class:`OpenMLFlow` objects of @@ -310,7 +316,7 @@ def _from_dict(cls, xml_dict): ------- OpenMLFlow - """ # noqa E501 + """ # E501 arguments = OrderedDict() dic = xml_dict["oml:flow"] @@ -380,30 +386,34 @@ def _from_dict(cls, xml_dict): arguments["tags"] = extract_xml_tags("oml:tag", dic) arguments["model"] = None - flow = cls(**arguments) + return cls(**arguments) - return flow + def to_filesystem(self, output_directory: str | Path) -> None: + """Write a flow to the filesystem as XML to output_directory.""" + output_directory = Path(output_directory) + output_directory.mkdir(parents=True, exist_ok=True) - def to_filesystem(self, output_directory: str) -> None: - os.makedirs(output_directory, exist_ok=True) - if "flow.xml" in os.listdir(output_directory): + output_path = output_directory / "flow.xml" + if output_path.exists(): raise ValueError("Output directory already contains a flow.xml file.") run_xml = self._to_xml() - with open(os.path.join(output_directory, "flow.xml"), "w") as f: + with output_path.open("w") as f: f.write(run_xml) @classmethod - def from_filesystem(cls, input_directory) -> "OpenMLFlow": - with open(os.path.join(input_directory, "flow.xml"), "r") as f: + def from_filesystem(cls, input_directory: str | Path) -> OpenMLFlow: + """Read a flow from an XML in input_directory on the filesystem.""" + input_directory = Path(input_directory) / "flow.xml" + with input_directory.open() as f: xml_string = f.read() return OpenMLFlow._from_dict(xmltodict.parse(xml_string)) - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.flow_id = int(xml_response["oml:upload_flow"]["oml:id"]) - def publish(self, raise_error_if_exists: bool = False) -> "OpenMLFlow": + def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow: # noqa: FBT001, FBT002 """Publish this flow to OpenML server. 
Raises a PyOpenMLError if the flow exists on the server, but @@ -430,17 +440,17 @@ def publish(self, raise_error_if_exists: bool = False) -> "OpenMLFlow": if not flow_id: if self.flow_id: raise openml.exceptions.PyOpenMLError( - "Flow does not exist on the server, " "but 'flow.flow_id' is not None." + "Flow does not exist on the server, " "but 'flow.flow_id' is not None.", ) super().publish() + assert self.flow_id is not None # for mypy flow_id = self.flow_id elif raise_error_if_exists: - error_message = "This OpenMLFlow already exists with id: {}.".format(flow_id) + error_message = f"This OpenMLFlow already exists with id: {flow_id}." raise openml.exceptions.PyOpenMLError(error_message) elif self.flow_id is not None and self.flow_id != flow_id: raise openml.exceptions.PyOpenMLError( - "Local flow_id does not match server flow_id: " - "'{}' vs '{}'".format(self.flow_id, flow_id) + "Local flow_id does not match server flow_id: " f"'{self.flow_id}' vs '{flow_id}'", ) flow = openml.flows.functions.get_flow(flow_id) @@ -457,12 +467,12 @@ def publish(self, raise_error_if_exists: bool = False) -> "OpenMLFlow": message = e.args[0] raise ValueError( "The flow on the server is inconsistent with the local flow. " - "The server flow ID is {}. Please check manually and remove " - "the flow if necessary! Error is:\n'{}'".format(flow_id, message) - ) + f"The server flow ID is {flow_id}. Please check manually and remove " + f"the flow if necessary! Error is:\n'{message}'", + ) from e return self - def get_structure(self, key_item: str) -> Dict[str, List[str]]: + def get_structure(self, key_item: str) -> dict[str, list[str]]: """ Returns for each sub-component of the flow the path of identifiers that should be traversed to reach this component. 
The resulting dict @@ -482,15 +492,15 @@ def get_structure(self, key_item: str) -> Dict[str, List[str]]: """ if key_item not in ["flow_id", "name"]: raise ValueError("key_item should be in {flow_id, name}") - structure = dict() + structure = {} for key, sub_flow in self.components.items(): sub_structure = sub_flow.get_structure(key_item) for flow_name, flow_sub_structure in sub_structure.items(): - structure[flow_name] = [key] + flow_sub_structure + structure[flow_name] = [key, *flow_sub_structure] structure[getattr(self, key_item)] = [] return structure - def get_subflow(self, structure): + def get_subflow(self, structure: list[str]) -> OpenMLFlow: """ Returns a subflow from the tree of dependencies. @@ -512,17 +522,30 @@ def get_subflow(self, structure): sub_identifier = structure[0] if sub_identifier not in self.components: raise ValueError( - "Flow %s does not contain component with " - "identifier %s" % (self.name, sub_identifier) + f"Flow {self.name} does not contain component with " f"identifier {sub_identifier}", ) if len(structure) == 1: - return self.components[sub_identifier] - else: - structure.pop(0) - return self.components[sub_identifier].get_subflow(structure) + return self.components[sub_identifier] # type: ignore + + structure.pop(0) + return self.components[sub_identifier].get_subflow(structure) # type: ignore -def _copy_server_fields(source_flow, target_flow): +def _copy_server_fields(source_flow: OpenMLFlow, target_flow: OpenMLFlow) -> None: + """Recursively copies the fields added by the server + from the `source_flow` to the `target_flow`. + + Parameters + ---------- + source_flow : OpenMLFlow + To copy the fields from. + target_flow : OpenMLFlow + To copy the fields to. 
+ + Returns + ------- + None + """ fields_added_by_the_server = ["flow_id", "uploader", "version", "upload_date"] for field in fields_added_by_the_server: setattr(target_flow, field, getattr(source_flow, field)) @@ -532,6 +555,21 @@ def _copy_server_fields(source_flow, target_flow): _copy_server_fields(component, target_flow.components[name]) -def _add_if_nonempty(dic, key, value): +def _add_if_nonempty(dic: dict, key: Hashable, value: Any) -> None: + """Adds a key-value pair to a dictionary if the value is not None. + + Parameters + ---------- + dic: dict + To add the key-value pair to. + key: hashable + To add to the dictionary. + value: Any + To add to the dictionary. + + Returns + ------- + None + """ if value is not None: dic[key] = value diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 0e278d33a..b01e54b44 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -1,20 +1,22 @@ # License: BSD 3-Clause -import warnings +from __future__ import annotations -import dateutil.parser -from collections import OrderedDict import os -import io import re -import xmltodict +import warnings +from collections import OrderedDict +from typing import Any, Dict, overload +from typing_extensions import Literal + +import dateutil.parser import pandas as pd -from typing import Any, Union, Dict, Optional, List +import xmltodict -from ..exceptions import OpenMLCacheException import openml._api_calls -from . import OpenMLFlow import openml.utils +from openml.exceptions import OpenMLCacheException +from . import OpenMLFlow FLOWS_CACHE_DIR_NAME = "flows" @@ -57,20 +59,19 @@ def _get_cached_flow(fid: int) -> OpenMLFlow: ------- OpenMLFlow. 
""" - fid_cache_dir = openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, fid) - flow_file = os.path.join(fid_cache_dir, "flow.xml") + flow_file = fid_cache_dir / "flow.xml" try: - with io.open(flow_file, encoding="utf8") as fh: + with flow_file.open(encoding="utf8") as fh: return _create_flow_from_xml(fh.read()) - except (OSError, IOError): + except OSError as e: openml.utils._remove_cache_dir_for_id(FLOWS_CACHE_DIR_NAME, fid_cache_dir) - raise OpenMLCacheException("Flow file for fid %d not " "cached" % fid) + raise OpenMLCacheException("Flow file for fid %d not " "cached" % fid) from e @openml.utils.thread_safe_if_oslo_installed -def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow: +def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow: # noqa: FBT001, FBT002 """Download the OpenML flow for a given flow ID. Parameters @@ -121,25 +122,58 @@ def _get_flow_description(flow_id: int) -> OpenMLFlow: try: return _get_cached_flow(flow_id) except OpenMLCacheException: - xml_file = os.path.join( - openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id), - "flow.xml", + xml_file = ( + openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id) / "flow.xml" ) - flow_xml = openml._api_calls._perform_api_call("flow/%d" % flow_id, request_method="get") - with io.open(xml_file, "w", encoding="utf8") as fh: + + with xml_file.open("w", encoding="utf8") as fh: fh.write(flow_xml) return _create_flow_from_xml(flow_xml) +@overload def list_flows( - offset: Optional[int] = None, - size: Optional[int] = None, - tag: Optional[str] = None, - output_format: str = "dict", - **kwargs -) -> Union[Dict, pd.DataFrame]: + offset: int | None = ..., + size: int | None = ..., + tag: str | None = ..., + output_format: Literal["dict"] = "dict", + **kwargs: Any, +) -> dict: + ... 
+ + +@overload +def list_flows( + offset: int | None = ..., + size: int | None = ..., + tag: str | None = ..., + *, + output_format: Literal["dataframe"], + **kwargs: Any, +) -> pd.DataFrame: + ... + + +@overload +def list_flows( + offset: int | None, + size: int | None, + tag: str | None, + output_format: Literal["dataframe"], + **kwargs: Any, +) -> pd.DataFrame: + ... + + +def list_flows( + offset: int | None = None, + size: int | None = None, + tag: str | None = None, + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Return a list of all flows which are on OpenML. (Supports large amount of results) @@ -186,7 +220,7 @@ def list_flows( """ if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", ) # TODO: [0.15] @@ -199,16 +233,33 @@ def list_flows( warnings.warn(msg, category=FutureWarning, stacklevel=2) return openml.utils._list_all( - output_format=output_format, + list_output_format=output_format, listing_call=_list_flows, offset=offset, size=size, tag=tag, - **kwargs + **kwargs, ) -def _list_flows(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: +@overload +def _list_flows(output_format: Literal["dict"] = ..., **kwargs: Any) -> dict: + ... + + +@overload +def _list_flows(*, output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: + ... + + +@overload +def _list_flows(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: + ... + + +def _list_flows( + output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any +) -> dict | pd.DataFrame: """ Perform the api call that return a list of all flows. 
@@ -230,12 +281,12 @@ def _list_flows(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" return __list_flows(api_call=api_call, output_format=output_format) -def flow_exists(name: str, external_version: str) -> Union[int, bool]: +def flow_exists(name: str, external_version: str) -> int | bool: """Retrieves the flow id. A flow is uniquely identified by name + external_version. @@ -273,10 +324,10 @@ def flow_exists(name: str, external_version: str) -> Union[int, bool]: def get_flow_id( - model: Optional[Any] = None, - name: Optional[str] = None, - exact_version=True, -) -> Union[int, bool, List[int]]: + model: Any | None = None, + name: str | None = None, + exact_version: bool = True, # noqa: FBT001, FBT002 +) -> int | bool | list[int]: """Retrieves the flow id for a model or a flow name. Provide either a model or a name to this function. Depending on the input, it does @@ -300,18 +351,14 @@ def get_flow_id( exact_version : bool Whether to return the flow id of the exact version or all flow ids where the name of the flow matches. This is only taken into account for a model where a version number - is available. + is available (requires ``model`` to be set). Returns ------- int or bool, List flow id iff exists, ``False`` otherwise, List if ``exact_version is False`` """ - if model is None and name is None: - raise ValueError( - "Need to provide either argument `model` or argument `name`, but both are `None`." 
- ) - elif model is not None and name is not None: + if model is not None and name is not None: raise ValueError("Must provide either argument `model` or argument `name`, but not both.") if model is not None: @@ -323,30 +370,63 @@ def get_flow_id( flow = extension.model_to_flow(model) flow_name = flow.name external_version = flow.external_version - else: + elif name is not None: flow_name = name exact_version = False + external_version = None + else: + raise ValueError( + "Need to provide either argument `model` or argument `name`, but both are `None`." + ) if exact_version: + if external_version is None: + raise ValueError("exact_version should be False if model is None!") return flow_exists(name=flow_name, external_version=external_version) - else: - flows = list_flows(output_format="dataframe") - assert isinstance(flows, pd.DataFrame) # Make mypy happy - flows = flows.query('name == "{}"'.format(flow_name)) - return flows["id"].to_list() + flows = list_flows(output_format="dataframe") + assert isinstance(flows, pd.DataFrame) # Make mypy happy + flows = flows.query(f'name == "{flow_name}"') + return flows["id"].to_list() # type: ignore[no-any-return] + + +@overload +def __list_flows(api_call: str, output_format: Literal["dict"] = "dict") -> dict: + ... + + +@overload +def __list_flows(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: + ... -def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]: + +def __list_flows( + api_call: str, output_format: Literal["dict", "dataframe"] = "dict" +) -> dict | pd.DataFrame: + """Retrieve information about flows from OpenML API + and parse it to a dictionary or a Pandas DataFrame. + + Parameters + ---------- + api_call: str + Retrieves the information about flows. + output_format: str in {"dict", "dataframe"} + The output format. + + Returns + ------- + The flows information in the specified output format. 
+ """ xml_string = openml._api_calls._perform_api_call(api_call, "get") flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",)) # Minimalistic check if the XML is useful - assert type(flows_dict["oml:flows"]["oml:flow"]) == list, type(flows_dict["oml:flows"]) + assert isinstance(flows_dict["oml:flows"]["oml:flow"], list), type(flows_dict["oml:flows"]) assert flows_dict["oml:flows"]["@xmlns:oml"] == "http://openml.org/openml", flows_dict[ "oml:flows" ]["@xmlns:oml"] - flows = dict() + flows = {} for flow_ in flows_dict["oml:flows"]["oml:flow"]: fid = int(flow_["oml:id"]) flow = { @@ -367,27 +447,25 @@ def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.D def _check_flow_for_server_id(flow: OpenMLFlow) -> None: """Raises a ValueError if the flow or any of its subflows has no flow id.""" - # Depth-first search to check if all components were uploaded to the # server before parsing the parameters - stack = list() - stack.append(flow) + stack = [flow] while len(stack) > 0: current = stack.pop() if current.flow_id is None: raise ValueError("Flow %s has no flow_id!" % current.name) - else: - for component in current.components.values(): - stack.append(component) + for component in current.components.values(): + stack.append(component) -def assert_flows_equal( + +def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 flow1: OpenMLFlow, flow2: OpenMLFlow, - ignore_parameter_values_on_older_children: Optional[str] = None, - ignore_parameter_values: bool = False, - ignore_custom_name_if_none: bool = False, - check_description: bool = True, + ignore_parameter_values_on_older_children: str | None = None, + ignore_parameter_values: bool = False, # noqa: FBT001, FBT002 + ignore_custom_name_if_none: bool = False, # noqa: FBT001, FBT002 + check_description: bool = True, # noqa: FBT001, FBT002 ) -> None: """Check equality of two flows. 
@@ -444,11 +522,11 @@ def assert_flows_equal( for name in set(attr1.keys()).union(attr2.keys()): if name not in attr1: raise ValueError( - "Component %s only available in " "argument2, but not in argument1." % name + "Component %s only available in " "argument2, but not in argument1." % name, ) if name not in attr2: raise ValueError( - "Component %s only available in " "argument2, but not in argument1." % name + "Component %s only available in " "argument2, but not in argument1." % name, ) assert_flows_equal( attr1[name], @@ -473,13 +551,16 @@ def assert_flows_equal( raise ValueError( "Flow %s: parameter set of flow " "differs from the parameters stored " - "on the server." % flow1.name + "on the server." % flow1.name, ) if ignore_parameter_values_on_older_children: + assert ( + flow1.upload_date is not None + ), "Flow1 has no upload date that allows us to compare age of children." upload_date_current_flow = dateutil.parser.parse(flow1.upload_date) upload_date_parent_flow = dateutil.parser.parse( - ignore_parameter_values_on_older_children + ignore_parameter_values_on_older_children, ) if upload_date_current_flow < upload_date_parent_flow: continue @@ -506,7 +587,7 @@ def assert_flows_equal( params2 = set(flow2.parameters_meta_info) if params1 != params2: raise ValueError( - "Parameter list in meta info for parameters differ " "in the two flows." 
+ "Parameter list in meta info for parameters differ " "in the two flows.", ) # iterating over the parameter's meta info list for param in params1: @@ -523,18 +604,19 @@ def assert_flows_equal( value2 = flow2.parameters_meta_info[param] if value1 is None or value2 is None: continue - elif value1 != value2: + + if value1 != value2: raise ValueError( - "Flow {}: data type for parameter {} in {} differ " - "as {}\nvs\n{}".format(flow1.name, param, key, value1, value2) + f"Flow {flow1.name}: data type for parameter {param} in {key} differ " + f"as {value1}\nvs\n{value2}", ) # the continue is to avoid the 'attr != attr2' check at end of function continue if attr1 != attr2: raise ValueError( - "Flow %s: values for attribute '%s' differ: " - "'%s'\nvs\n'%s'." % (str(flow1.name), str(key), str(attr1), str(attr2)) + f"Flow {flow1.name!s}: values for attribute '{key!s}' differ: " + f"'{attr1!s}'\nvs\n'{attr2!s}'.", ) @@ -549,7 +631,6 @@ def _create_flow_from_xml(flow_xml: str) -> OpenMLFlow: ------- OpenMLFlow """ - return OpenMLFlow._from_dict(xmltodict.parse(flow_xml)) diff --git a/openml/runs/__init__.py b/openml/runs/__init__.py index 2abbd8f29..6d3dca504 100644 --- a/openml/runs/__init__.py +++ b/openml/runs/__init__.py @@ -1,19 +1,19 @@ # License: BSD 3-Clause -from .run import OpenMLRun -from .trace import OpenMLRunTrace, OpenMLTraceIteration from .functions import ( - run_model_on_task, - run_flow_on_task, + delete_run, get_run, - list_runs, - get_runs, get_run_trace, - run_exists, + get_runs, initialize_model_from_run, initialize_model_from_trace, - delete_run, + list_runs, + run_exists, + run_flow_on_task, + run_model_on_task, ) +from .run import OpenMLRun +from .trace import OpenMLRunTrace, OpenMLTraceIteration __all__ = [ "OpenMLRun", diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 96e031aee..7a082e217 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -1,62 +1,73 @@ # License: BSD 3-Clause +from __future__ import 
annotations -from collections import OrderedDict -import io import itertools -import os import time -from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING, cast # noqa F401 import warnings +from collections import OrderedDict +from pathlib import Path +from typing import TYPE_CHECKING, Any +from typing_extensions import Literal -import sklearn.metrics -import xmltodict import numpy as np import pandas as pd +import sklearn.metrics +import xmltodict from joblib.parallel import Parallel, delayed import openml -import openml.utils import openml._api_calls -from openml.exceptions import PyOpenMLError -from openml.extensions import get_extension_by_model +import openml.utils from openml import config +from openml.exceptions import ( + OpenMLCacheException, + OpenMLRunsExistError, + OpenMLServerException, + PyOpenMLError, +) +from openml.extensions import get_extension_by_model +from openml.flows import OpenMLFlow, flow_exists, get_flow from openml.flows.flow import _copy_server_fields -from ..flows import get_flow, flow_exists, OpenMLFlow -from ..setups import setup_exists, initialize_model -from ..exceptions import OpenMLCacheException, OpenMLServerException, OpenMLRunsExistError -from ..tasks import ( - OpenMLTask, +from openml.setups import initialize_model, setup_exists +from openml.tasks import ( OpenMLClassificationTask, OpenMLClusteringTask, + OpenMLLearningCurveTask, OpenMLRegressionTask, OpenMLSupervisedTask, - OpenMLLearningCurveTask, + OpenMLTask, + TaskType, + get_task, ) + from .run import OpenMLRun from .trace import OpenMLRunTrace -from ..tasks import TaskType, get_task # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: + from openml.config import _Config from openml.extensions.extension_interface import Extension # get_dict is in run.py to avoid circular imports RUNS_CACHE_DIR_NAME = "runs" +ERROR_CODE = 512 -def run_model_on_task( +# TODO(eddiebergman): Could 
potentially overload this but +# it seems very big to do so +def run_model_on_task( # noqa: PLR0913 model: Any, - task: Union[int, str, OpenMLTask], - avoid_duplicate_runs: bool = True, - flow_tags: Optional[List[str]] = None, - seed: Optional[int] = None, - add_local_measures: bool = True, - upload_flow: bool = False, - return_flow: bool = False, - dataset_format: str = "dataframe", - n_jobs: Optional[int] = None, -) -> Union[OpenMLRun, Tuple[OpenMLRun, OpenMLFlow]]: + task: int | str | OpenMLTask, + avoid_duplicate_runs: bool = True, # noqa: FBT001, FBT002 + flow_tags: list[str] | None = None, + seed: int | None = None, + add_local_measures: bool = True, # noqa: FBT001, FBT002 + upload_flow: bool = False, # noqa: FBT001, FBT002 + return_flow: bool = False, # noqa: FBT001, FBT002 + dataset_format: Literal["array", "dataframe"] = "dataframe", + n_jobs: int | None = None, +) -> OpenMLRun | tuple[OpenMLRun, OpenMLFlow]: """Run the model on the dataset defined by the task. Parameters @@ -104,6 +115,8 @@ def run_model_on_task( "Please set your API key in the OpenML configuration file, see" "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial" ".html#authentication for more information on authentication.", + RuntimeWarning, + stacklevel=2, ) # TODO: At some point in the future do not allow for arguments in old order (6-2018). @@ -116,6 +129,7 @@ def run_model_on_task( "will not be supported in the future. 
Please use the " "order (model, task).", DeprecationWarning, + stacklevel=2, ) task, model = model, task @@ -127,11 +141,24 @@ def run_model_on_task( flow = extension.model_to_flow(model) - def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTask: - if isinstance(task, (int, str)): - return get_task(int(task)) - else: - return task + def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask: + """Retrieve an OpenMLTask object from either an integer or string ID, + or directly from an OpenMLTask object. + + Parameters + ---------- + _task : Union[int, str, OpenMLTask] + The task ID or the OpenMLTask object. + + Returns + ------- + OpenMLTask + The OpenMLTask object. + """ + if isinstance(_task, (int, str)): + return get_task(int(_task)) # type: ignore + + return _task task = get_task_and_type_conversion(task) @@ -151,16 +178,16 @@ def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTas return run -def run_flow_on_task( +def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 flow: OpenMLFlow, task: OpenMLTask, - avoid_duplicate_runs: bool = True, - flow_tags: Optional[List[str]] = None, - seed: Optional[int] = None, - add_local_measures: bool = True, - upload_flow: bool = False, - dataset_format: str = "dataframe", - n_jobs: Optional[int] = None, + avoid_duplicate_runs: bool = True, # noqa: FBT002, FBT001 + flow_tags: list[str] | None = None, + seed: int | None = None, + add_local_measures: bool = True, # noqa: FBT001, FBT002 + upload_flow: bool = False, # noqa: FBT001, FBT002 + dataset_format: Literal["array", "dataframe"] = "dataframe", + n_jobs: int | None = None, ) -> OpenMLRun: """Run the model provided by the flow on the dataset defined by task. @@ -217,6 +244,7 @@ def run_flow_on_task( "will not be supported in the future. 
Please use the " "order (model, Flow).", DeprecationWarning, + stacklevel=2, ) task, flow = flow, task @@ -225,6 +253,7 @@ def run_flow_on_task( if flow.model is None: flow.model = flow.extension.flow_to_model(flow) + flow.model = flow.extension.seed_model(flow.model, seed=seed) # We only need to sync with the server right now if we want to upload the flow, @@ -233,17 +262,16 @@ def run_flow_on_task( if upload_flow or avoid_duplicate_runs: flow_id = flow_exists(flow.name, flow.external_version) if isinstance(flow.flow_id, int) and flow_id != flow.flow_id: - if flow_id: + if flow_id is not False: raise PyOpenMLError( "Local flow_id does not match server flow_id: " - "'{}' vs '{}'".format(flow.flow_id, flow_id) - ) - else: - raise PyOpenMLError( - "Flow does not exist on the server, " "but 'flow.flow_id' is not None." + f"'{flow.flow_id}' vs '{flow_id}'", ) + raise PyOpenMLError( + "Flow does not exist on the server, but 'flow.flow_id' is not None." + ) - if upload_flow and not flow_id: + if upload_flow and flow_id is None: flow.publish() flow_id = flow.flow_id elif flow_id: @@ -255,14 +283,13 @@ def run_flow_on_task( ids = run_exists(task.task_id, setup_id) if ids: error_message = ( - "One or more runs of this setup were " "already performed on the task." + "One or more runs of this setup were already performed on the task." ) raise OpenMLRunsExistError(ids, error_message) else: # Flow does not exist on server and we do not want to upload it. # No sync with the server happens. flow_id = None - pass dataset = task.get_dataset() @@ -272,7 +299,9 @@ def run_flow_on_task( if flow.extension.check_if_model_fitted(flow.model): warnings.warn( "The model is already fitted!" - " This might cause inconsistency in comparison of results." 
+ " This might cause inconsistency in comparison of results.", + RuntimeWarning, + stacklevel=2, ) # execute the run @@ -315,9 +344,9 @@ def run_flow_on_task( run.fold_evaluations = fold_evaluations if flow_id: - message = "Executed Task {} with Flow id:{}".format(task.task_id, run.flow_id) + message = f"Executed Task {task.task_id} with Flow id:{run.flow_id}" else: - message = "Executed Task {} on local Flow with name {}.".format(task.task_id, flow.name) + message = f"Executed Task {task.task_id} on local Flow with name {flow.name}." config.logger.info(message) return run @@ -336,8 +365,7 @@ def get_run_trace(run_id: int) -> OpenMLRunTrace: openml.runs.OpenMLTrace """ trace_xml = openml._api_calls._perform_api_call("run/trace/%d" % run_id, "get") - run_trace = OpenMLRunTrace.trace_from_xml(trace_xml) - return run_trace + return OpenMLRunTrace.trace_from_xml(trace_xml) def initialize_model_from_run(run_id: int) -> Any: @@ -355,6 +383,9 @@ def initialize_model_from_run(run_id: int) -> Any: model """ run = get_run(run_id) + # TODO(eddiebergman): I imagine this is None if it's not published, + # might need to raise an explicit error for that + assert run.setup_id is not None return initialize_model(run.setup_id) @@ -362,7 +393,7 @@ def initialize_model_from_trace( run_id: int, repeat: int, fold: int, - iteration: Optional[int] = None, + iteration: int | None = None, ) -> Any: """ Initialize a model based on the parameters that were set @@ -392,6 +423,10 @@ def initialize_model_from_trace( model """ run = get_run(run_id) + # TODO(eddiebergman): I imagine this is None if it's not published, + # might need to raise an explicit error for that + assert run.flow_id is not None + flow = get_flow(run.flow_id) run_trace = get_run_trace(run_id) @@ -404,11 +439,10 @@ def initialize_model_from_trace( current = run_trace.trace_iterations[(repeat, fold, iteration)] search_model = initialize_model_from_run(run_id) - model = 
flow.extension.instantiate_model_from_hpo_class(search_model, current) - return model + return flow.extension.instantiate_model_from_hpo_class(search_model, current) -def run_exists(task_id: int, setup_id: int) -> Set[int]: +def run_exists(task_id: int, setup_id: int) -> set[int]: """Checks whether a task/setup combination is already present on the server. @@ -428,31 +462,58 @@ def run_exists(task_id: int, setup_id: int) -> Set[int]: return set() try: - result = cast( - pd.DataFrame, list_runs(task=[task_id], setup=[setup_id], output_format="dataframe") - ) + result = list_runs(task=[task_id], setup=[setup_id], output_format="dataframe") + assert isinstance(result, pd.DataFrame) # TODO(eddiebergman): Remove once #1299 return set() if result.empty else set(result["run_id"]) except OpenMLServerException as exception: - # error code 512 implies no results. The run does not exist yet - assert exception.code == 512 + # error code implies no results. The run does not exist yet + if exception.code != ERROR_CODE: + raise exception return set() -def _run_task_get_arffcontent( +def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901 + *, model: Any, task: OpenMLTask, - extension: "Extension", + extension: Extension, add_local_measures: bool, - dataset_format: str, - n_jobs: Optional[int] = None, -) -> Tuple[ - List[List], - Optional[OpenMLRunTrace], - "OrderedDict[str, OrderedDict]", - "OrderedDict[str, OrderedDict]", + dataset_format: Literal["array", "dataframe"], + n_jobs: int | None = None, +) -> tuple[ + list[list], + OpenMLRunTrace | None, + OrderedDict[str, OrderedDict], + OrderedDict[str, OrderedDict], ]: - arff_datacontent = [] # type: List[List] - traces = [] # type: List[OpenMLRunTrace] + """Runs the hyperparameter optimization on the given task + and returns the arfftrace content. + + Parameters + ---------- + model : Any + The model that is to be evalauted. + task : OpenMLTask + The OpenMLTask to evaluate. 
+ extension : Extension + The OpenML extension object. + add_local_measures : bool + Whether to compute additional local evaluation measures. + dataset_format : str + The format in which to download the dataset. + n_jobs : int + Number of jobs to run in parallel. + If None, use 1 core by default. If -1, use all available cores. + + Returns + ------- + Tuple[List[List], Optional[OpenMLRunTrace], + OrderedDict[str, OrderedDict], OrderedDict[str, OrderedDict]] + A tuple containing the arfftrace content, + the OpenML run trace, the global and local evaluation measures. + """ + arff_datacontent = [] # type: list[list] + traces = [] # type: list[OpenMLRunTrace] # stores fold-based evaluation measures. In case of a sample based task, # this information is multiple times overwritten, but due to the ordering # of tne loops, eventually it contains the information based on the full @@ -484,7 +545,18 @@ def _run_task_get_arffcontent( # Execute runs in parallel # assuming the same number of tasks as workers (n_jobs), the total compute time for this # statement will be similar to the slowest run - job_rvals = Parallel(verbose=0, n_jobs=n_jobs)( + # TODO(eddiebergman): Simplify this + job_rvals: list[ + tuple[ + np.ndarray, + pd.DataFrame | None, + np.ndarray, + pd.DataFrame | None, + OpenMLRunTrace | None, + OrderedDict[str, float], + ], + ] + job_rvals = Parallel(verbose=0, n_jobs=n_jobs)( # type: ignore delayed(_run_task_get_arffcontent_parallel_helper)( extension=extension, fold_no=fold_no, @@ -495,22 +567,32 @@ def _run_task_get_arffcontent( dataset_format=dataset_format, configuration=_config, ) - for n_fit, rep_no, fold_no, sample_no in jobs + for _n_fit, rep_no, fold_no, sample_no in jobs ) # job_rvals contain the output of all the runs with one-to-one correspondence with `jobs` for n_fit, rep_no, fold_no, sample_no in jobs: - pred_y, proba_y, test_indices, test_y, trace, user_defined_measures_fold = job_rvals[ + pred_y, proba_y, test_indices, test_y, inner_trace, 
user_defined_measures_fold = job_rvals[ n_fit - 1 ] - if trace is not None: - traces.append(trace) + + if inner_trace is not None: + traces.append(inner_trace) # add client-side calculated metrics. These is used on the server as # consistency check, only useful for supervised tasks - def _calculate_local_measure(sklearn_fn, openml_name): - user_defined_measures_fold[openml_name] = sklearn_fn(test_y, pred_y) + def _calculate_local_measure( # type: ignore + sklearn_fn, + openml_name, + _test_y=test_y, + _pred_y=pred_y, + _user_defined_measures_fold=user_defined_measures_fold, + ): + _user_defined_measures_fold[openml_name] = sklearn_fn(_test_y, _pred_y) if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): + assert test_y is not None + assert proba_y is not None + for i, tst_idx in enumerate(test_indices): if task.class_labels is not None: prediction = ( @@ -554,6 +636,7 @@ def _calculate_local_measure(sklearn_fn, openml_name): ) elif isinstance(task, OpenMLRegressionTask): + assert test_y is not None for i, _ in enumerate(test_indices): truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i] arff_line = format_prediction( @@ -601,15 +684,14 @@ def _calculate_local_measure(sklearn_fn, openml_name): sample_no ] = user_defined_measures_fold[measure] + trace: OpenMLRunTrace | None = None if len(traces) > 0: - if len(traces) != n_fit: + if len(traces) != len(jobs): raise ValueError( - "Did not find enough traces (expected {}, found {})".format(n_fit, len(traces)) + f"Did not find enough traces (expected {len(jobs)}, found {len(traces)})", ) - else: - trace = OpenMLRunTrace.merge_traces(traces) - else: - trace = None + + trace = OpenMLRunTrace.merge_traces(traces) return ( arff_datacontent, @@ -619,54 +701,88 @@ def _calculate_local_measure(sklearn_fn, openml_name): ) -def _run_task_get_arffcontent_parallel_helper( - extension: "Extension", +def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 + extension: Extension, fold_no: 
int, model: Any, rep_no: int, sample_no: int, task: OpenMLTask, - dataset_format: str, - configuration: Optional[Dict] = None, -) -> Tuple[ + dataset_format: Literal["array", "dataframe"], + configuration: _Config | None = None, +) -> tuple[ np.ndarray, - Optional[pd.DataFrame], + pd.DataFrame | None, np.ndarray, - Optional[pd.DataFrame], - Optional[OpenMLRunTrace], - "OrderedDict[str, float]", + pd.DataFrame | None, + OpenMLRunTrace | None, + OrderedDict[str, float], ]: + """Helper function that runs a single model on a single task fold sample. + + Parameters + ---------- + extension : Extension + An OpenML extension instance. + fold_no : int + The fold number to be run. + model : Any + The model that is to be evaluated. + rep_no : int + Repetition number to be run. + sample_no : int + Sample number to be run. + task : OpenMLTask + The task object from OpenML. + dataset_format : str + The dataset format to be used. + configuration : _Config + Hyperparameters to configure the model. + + Returns + ------- + Tuple[np.ndarray, Optional[pd.DataFrame], np.ndarray, Optional[pd.DataFrame], + Optional[OpenMLRunTrace], OrderedDict[str, float]] + A tuple containing the predictions, probability estimates (if applicable), + actual target values, actual target value probabilities (if applicable), + the trace object of the OpenML run (if applicable), + and a dictionary of local measures for this particular fold. 
+ """ # Sets up the OpenML instantiated in the child process to match that of the parent's # if configuration=None, loads the default config._setup(configuration) train_indices, test_indices = task.get_train_test_split_indices( - repeat=rep_no, fold=fold_no, sample=sample_no + repeat=rep_no, + fold=fold_no, + sample=sample_no, ) if isinstance(task, OpenMLSupervisedTask): x, y = task.get_X_and_y(dataset_format=dataset_format) - if dataset_format == "dataframe": + if isinstance(x, pd.DataFrame): + assert isinstance(y, (pd.Series, pd.DataFrame)) train_x = x.iloc[train_indices] train_y = y.iloc[train_indices] test_x = x.iloc[test_indices] test_y = y.iloc[test_indices] else: - train_x = x[train_indices] + # TODO(eddiebergman): Complains spmatrix doesn't support __getitem__ for typing + assert y is not None + train_x = x[train_indices] # type: ignore train_y = y[train_indices] - test_x = x[test_indices] + test_x = x[test_indices] # type: ignore test_y = y[test_indices] elif isinstance(task, OpenMLClusteringTask): x = task.get_X(dataset_format=dataset_format) - if dataset_format == "dataframe": - train_x = x.iloc[train_indices] - else: - train_x = x[train_indices] + # TODO(eddiebergman): Complains spmatrix doesn't support __getitem__ for typing + train_x = x.iloc[train_indices] if isinstance(x, pd.DataFrame) else x[train_indices] # type: ignore train_y = None test_x = None test_y = None else: raise NotImplementedError(task.task_type) + config.logger.info( "Going to run model {} on dataset {} for repeat {} fold {} sample {}".format( str(model), @@ -674,7 +790,7 @@ def _run_task_get_arffcontent_parallel_helper( rep_no, fold_no, sample_no, - ) + ), ) ( pred_y, @@ -685,15 +801,16 @@ def _run_task_get_arffcontent_parallel_helper( model=model, task=task, X_train=train_x, - y_train=train_y, + # TODO(eddiebergman): Likely should not be ignored + y_train=train_y, # type: ignore rep_no=rep_no, fold_no=fold_no, X_test=test_x, ) - return pred_y, proba_y, test_indices, test_y, trace, 
user_defined_measures_fold + return pred_y, proba_y, test_indices, test_y, trace, user_defined_measures_fold # type: ignore -def get_runs(run_ids): +def get_runs(run_ids: list[int]) -> list[OpenMLRun]: """Gets all runs in run_ids list. Parameters @@ -705,7 +822,6 @@ def get_runs(run_ids): runs : list of OpenMLRun List of runs corresponding to IDs, fetched from the server. """ - runs = [] for run_id in run_ids: runs.append(get_run(run_id)) @@ -713,7 +829,7 @@ def get_runs(run_ids): @openml.utils.thread_safe_if_oslo_installed -def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: +def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: # noqa: FBT002, FBT001 """Gets run corresponding to run_id. Parameters @@ -731,29 +847,26 @@ def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: run : OpenMLRun Run corresponding to ID, fetched from the server. """ - run_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id) - run_file = os.path.join(run_dir, "description.xml") + run_dir = Path(openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id)) + run_file = run_dir / "description.xml" - if not os.path.exists(run_dir): - os.makedirs(run_dir) + run_dir.mkdir(parents=True, exist_ok=True) try: if not ignore_cache: return _get_cached_run(run_id) - else: - raise OpenMLCacheException(message="dummy") + + raise OpenMLCacheException(message="dummy") except OpenMLCacheException: run_xml = openml._api_calls._perform_api_call("run/%d" % run_id, "get") - with io.open(run_file, "w", encoding="utf8") as fh: + with run_file.open("w", encoding="utf8") as fh: fh.write(run_xml) - run = _create_run_from_xml(run_xml) + return _create_run_from_xml(run_xml) - return run - -def _create_run_from_xml(xml, from_server=True): +def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun: # noqa: PLR0915, PLR0912, C901, , FBT001, FBT002FBT """Create a run object from xml returned from server. 
Parameters @@ -771,7 +884,7 @@ def _create_run_from_xml(xml, from_server=True): New run object representing run_xml. """ - def obtain_field(xml_obj, fieldname, from_server, cast=None): + def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore # this function can be used to check whether a field is present in an # object. if it is not present, either returns None or throws an error # (this is usually done if the xml comes from the server) @@ -779,10 +892,11 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): if cast is not None: return cast(xml_obj[fieldname]) return xml_obj[fieldname] - elif not from_server: + + if not from_server: return None - else: - raise AttributeError("Run XML does not contain required (server) " "field: ", fieldname) + + raise AttributeError("Run XML does not contain required (server) " "field: ", fieldname) run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[ "oml:run" @@ -794,10 +908,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): task_type = obtain_field(run, "oml:task_type", from_server) # even with the server requirement this field may be empty. - if "oml:task_evaluation_measure" in run: - task_evaluation_measure = run["oml:task_evaluation_measure"] - else: - task_evaluation_measure = None + task_evaluation_measure = run.get("oml:task_evaluation_measure", None) if not from_server and run["oml:flow_id"] is None: # This can happen for a locally stored run of which the flow is not yet published. 
@@ -811,9 +922,10 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): if "oml:parameter_setting" in run: obtained_parameter_settings = run["oml:parameter_setting"] for parameter_dict in obtained_parameter_settings: - current_parameter = OrderedDict() - current_parameter["oml:name"] = parameter_dict["oml:name"] - current_parameter["oml:value"] = parameter_dict["oml:value"] + current_parameter = { + "oml:name": parameter_dict["oml:name"], + "oml:value": parameter_dict["oml:value"], + } if "oml:component" in parameter_dict: current_parameter["oml:component"] = parameter_dict["oml:component"] parameters.append(current_parameter) @@ -834,15 +946,14 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): t = openml.tasks.get_task(task_id, download_data=False) if not hasattr(t, "dataset_id"): raise ValueError( - "Unable to fetch dataset_id from the task({}) " - "linked to run({})".format(task_id, run_id) + f"Unable to fetch dataset_id from the task({task_id}) linked to run({run_id})", ) dataset_id = t.dataset_id - files = OrderedDict() - evaluations = OrderedDict() - fold_evaluations = OrderedDict() - sample_evaluations = OrderedDict() + files: dict[str, int] = {} + evaluations: dict[str, float | Any] = {} + fold_evaluations: dict[str, dict[int, dict[int, float | Any]]] = {} + sample_evaluations: dict[str, dict[int, dict[int, dict[int, float | Any]]]] = {} if "oml:output_data" not in run: if from_server: raise ValueError("Run does not contain output_data " "(OpenML server error?)") @@ -868,7 +979,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): else: raise ValueError( 'Could not find keys "value" or ' - '"array_data" in %s' % str(evaluation_dict.keys()) + '"array_data" in %s' % str(evaluation_dict.keys()), ) if ( "@repeat" in evaluation_dict @@ -879,19 +990,19 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): fold = int(evaluation_dict["@fold"]) sample = int(evaluation_dict["@sample"]) if key not in 
sample_evaluations: - sample_evaluations[key] = OrderedDict() + sample_evaluations[key] = {} if repeat not in sample_evaluations[key]: - sample_evaluations[key][repeat] = OrderedDict() + sample_evaluations[key][repeat] = {} if fold not in sample_evaluations[key][repeat]: - sample_evaluations[key][repeat][fold] = OrderedDict() + sample_evaluations[key][repeat][fold] = {} sample_evaluations[key][repeat][fold][sample] = value elif "@repeat" in evaluation_dict and "@fold" in evaluation_dict: repeat = int(evaluation_dict["@repeat"]) fold = int(evaluation_dict["@fold"]) if key not in fold_evaluations: - fold_evaluations[key] = OrderedDict() + fold_evaluations[key] = {} if repeat not in fold_evaluations[key]: - fold_evaluations[key][repeat] = OrderedDict() + fold_evaluations[key][repeat] = {} fold_evaluations[key][repeat][fold] = value else: evaluations[key] = value @@ -903,12 +1014,12 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): task = openml.tasks.get_task(task_id) if task.task_type_id == TaskType.SUBGROUP_DISCOVERY: raise NotImplementedError("Subgroup discovery tasks are not yet supported.") - else: - # JvR: actually, I am not sure whether this error should be raised. - # a run can consist without predictions. But for now let's keep it - # Matthias: yes, it should stay as long as we do not really handle - # this stuff - raise ValueError("No prediction files for run %d in run " "description XML" % run_id) + + # JvR: actually, I am not sure whether this error should be raised. + # a run can consist without predictions. 
But for now let's keep it + # Matthias: yes, it should stay as long as we do not really handle + # this stuff + raise ValueError("No prediction files for run %d in run description XML" % run_id) tags = openml.utils.extract_xml_tags("oml:tag", run) @@ -936,36 +1047,33 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): ) -def _get_cached_run(run_id): +def _get_cached_run(run_id: int) -> OpenMLRun: """Load a run from the cache.""" - run_cache_dir = openml.utils._create_cache_directory_for_id( - RUNS_CACHE_DIR_NAME, - run_id, - ) + run_cache_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id) + run_file = run_cache_dir / "description.xml" try: - run_file = os.path.join(run_cache_dir, "description.xml") - with io.open(run_file, encoding="utf8") as fh: - run = _create_run_from_xml(xml=fh.read()) - return run - - except (OSError, IOError): - raise OpenMLCacheException("Run file for run id %d not " "cached" % run_id) - - -def list_runs( - offset: Optional[int] = None, - size: Optional[int] = None, - id: Optional[List] = None, - task: Optional[List[int]] = None, - setup: Optional[List] = None, - flow: Optional[List] = None, - uploader: Optional[List] = None, - tag: Optional[str] = None, - study: Optional[int] = None, - display_errors: bool = False, - output_format: str = "dict", - **kwargs, -) -> Union[Dict, pd.DataFrame]: + with run_file.open(encoding="utf8") as fh: + return _create_run_from_xml(xml=fh.read()) + except OSError as e: + raise OpenMLCacheException(f"Run file for run id {run_id} not cached") from e + + +# TODO(eddiebergman): Could overload, likely too large an annoying to do +# nvm, will be deprecated in 0.15 +def list_runs( # noqa: PLR0913 + offset: int | None = None, + size: int | None = None, + id: list | None = None, # noqa: A002 + task: list[int] | None = None, + setup: list | None = None, + flow: list | None = None, + uploader: list | None = None, + tag: str | None = None, + study: int | None = None, + 
display_errors: bool = False, # noqa: FBT001, FBT002 + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ List all runs matching all of the given filters. (Supports large amount of results) @@ -1008,9 +1116,8 @@ def list_runs( dict of dicts, or dataframe """ if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." - ) + raise ValueError("Invalid output format selected. Only 'dict' or 'dataframe' applicable.") + # TODO: [0.15] if output_format == "dict": msg = ( @@ -1020,6 +1127,7 @@ def list_runs( ) warnings.warn(msg, category=FutureWarning, stacklevel=2) + # TODO(eddiebergman): Do we really need this runtime type validation? if id is not None and (not isinstance(id, list)): raise TypeError("id must be of type list.") if task is not None and (not isinstance(task, list)): @@ -1031,8 +1139,8 @@ def list_runs( if uploader is not None and (not isinstance(uploader, list)): raise TypeError("uploader must be of type list.") - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_runs, offset=offset, size=size, @@ -1048,17 +1156,17 @@ def list_runs( ) -def _list_runs( - id: Optional[List] = None, - task: Optional[List] = None, - setup: Optional[List] = None, - flow: Optional[List] = None, - uploader: Optional[List] = None, - study: Optional[int] = None, - display_errors: bool = False, - output_format: str = "dict", - **kwargs, -) -> Union[Dict, pd.DataFrame]: +def _list_runs( # noqa: PLR0913 + id: list | None = None, # noqa: A002 + task: list | None = None, + setup: list | None = None, + flow: list | None = None, + uploader: list | None = None, + study: int | None = None, + display_errors: bool = False, # noqa: FBT002, FBT001 + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | 
pd.DataFrame: """ Perform API call `/run/list/{filters}' ` @@ -1099,11 +1207,10 @@ def _list_runs( dict, or dataframe List of found runs. """ - api_call = "run/list" if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" if id is not None: api_call += "/run/%s" % ",".join([str(int(i)) for i in id]) if task is not None: @@ -1121,40 +1228,43 @@ def _list_runs( return __list_runs(api_call=api_call, output_format=output_format) -def __list_runs(api_call, output_format="dict"): +def __list_runs( + api_call: str, output_format: Literal["dict", "dataframe"] = "dict" +) -> dict | pd.DataFrame: """Helper function to parse API calls which are lists of runs""" xml_string = openml._api_calls._perform_api_call(api_call, "get") runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",)) # Minimalistic check if the XML is useful if "oml:runs" not in runs_dict: - raise ValueError('Error in return XML, does not contain "oml:runs": %s' % str(runs_dict)) - elif "@xmlns:oml" not in runs_dict["oml:runs"]: + raise ValueError(f'Error in return XML, does not contain "oml:runs": {runs_dict}') + + if "@xmlns:oml" not in runs_dict["oml:runs"]: raise ValueError( - "Error in return XML, does not contain " '"oml:runs"/@xmlns:oml: %s' % str(runs_dict) + f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {runs_dict}' ) - elif runs_dict["oml:runs"]["@xmlns:oml"] != "http://openml.org/openml": + + if runs_dict["oml:runs"]["@xmlns:oml"] != "http://openml.org/openml": raise ValueError( "Error in return XML, value of " '"oml:runs"/@xmlns:oml is not ' - '"http://openml.org/openml": %s' % str(runs_dict) + f'"http://openml.org/openml": {runs_dict}', ) - assert type(runs_dict["oml:runs"]["oml:run"]) == list, type(runs_dict["oml:runs"]) - - runs = OrderedDict() - for run_ in runs_dict["oml:runs"]["oml:run"]: - run_id = int(run_["oml:run_id"]) - run = { - "run_id": run_id, - "task_id": 
int(run_["oml:task_id"]), - "setup_id": int(run_["oml:setup_id"]), - "flow_id": int(run_["oml:flow_id"]), - "uploader": int(run_["oml:uploader"]), - "task_type": TaskType(int(run_["oml:task_type_id"])), - "upload_time": str(run_["oml:upload_time"]), - "error_message": str((run_["oml:error_message"]) or ""), + assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type(runs_dict["oml:runs"]) + + runs = { + int(r["oml:run_id"]): { + "run_id": int(r["oml:run_id"]), + "task_id": int(r["oml:task_id"]), + "setup_id": int(r["oml:setup_id"]), + "flow_id": int(r["oml:flow_id"]), + "uploader": int(r["oml:uploader"]), + "task_type": TaskType(int(r["oml:task_type_id"])), + "upload_time": str(r["oml:upload_time"]), + "error_message": str((r["oml:error_message"]) or ""), } - runs[run_id] = run + for r in runs_dict["oml:runs"]["oml:run"] + } if output_format == "dataframe": runs = pd.DataFrame.from_dict(runs, orient="index") @@ -1162,16 +1272,16 @@ def __list_runs(api_call, output_format="dict"): return runs -def format_prediction( +def format_prediction( # noqa: PLR0913 task: OpenMLSupervisedTask, repeat: int, fold: int, index: int, - prediction: Union[str, int, float], - truth: Union[str, int, float], - sample: Optional[int] = None, - proba: Optional[Dict[str, float]] = None, -) -> List[Union[str, int, float]]: + prediction: str | int | float, + truth: str | int | float, + sample: int | None = None, + proba: dict[str, float] | None = None, +) -> list[str | int | float]: """Format the predictions in the specific order as required for the run results. 
Parameters @@ -1216,14 +1326,15 @@ def format_prediction( if sample is None: if isinstance(task, OpenMLLearningCurveTask): raise ValueError("`sample` can not be none for LearningCurveTask") - else: - sample = 0 + + sample = 0 probabilities = [proba[c] for c in task.class_labels] return [repeat, fold, sample, index, prediction, truth, *probabilities] - elif isinstance(task, OpenMLRegressionTask): + + if isinstance(task, OpenMLRegressionTask): return [repeat, fold, index, prediction, truth] - else: - raise NotImplementedError(f"Formatting for {type(task)} is not supported.") + + raise NotImplementedError(f"Formatting for {type(task)} is not supported.") def delete_run(run_id: int) -> bool: diff --git a/openml/runs/run.py b/openml/runs/run.py index 5528c8a67..766f8c97f 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -1,10 +1,16 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import OrderedDict import pickle import time -from typing import Any, IO, TextIO, List, Union, Tuple, Optional, Dict # noqa F401 -import os +from collections import OrderedDict +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Sequence, +) import arff import numpy as np @@ -13,17 +19,21 @@ import openml import openml._api_calls from openml.base import OpenMLBase -from ..exceptions import PyOpenMLError -from ..flows import get_flow -from ..tasks import ( - get_task, - TaskType, +from openml.exceptions import PyOpenMLError +from openml.flows import OpenMLFlow, get_flow +from openml.tasks import ( OpenMLClassificationTask, - OpenMLLearningCurveTask, OpenMLClusteringTask, + OpenMLLearningCurveTask, OpenMLRegressionTask, + OpenMLTask, + TaskType, + get_task, ) +if TYPE_CHECKING: + from openml.runs.trace import OpenMLRunTrace + class OpenMLRun(OpenMLBase): """OpenML Run: result of running a model on an OpenML dataset. @@ -38,7 +48,7 @@ class OpenMLRun(OpenMLBase): The ID of the OpenML dataset used for the run. 
setup_string: str The setup string of the run. - output_files: Dict[str, str] + output_files: Dict[str, int] Specifies where each related file can be found. setup_id: int An integer representing the ID of the setup used for the run. @@ -66,7 +76,7 @@ class OpenMLRun(OpenMLBase): The evaluation measure used for the task. flow_name: str The name of the OpenML flow associated with the run. - parameter_settings: List[OrderedDict] + parameter_settings: list[OrderedDict] Representing the parameter settings used for the run. predictions_url: str The URL of the predictions file. @@ -85,33 +95,33 @@ class OpenMLRun(OpenMLBase): Description of the run stored in the run meta-data. """ - def __init__( + def __init__( # noqa: PLR0913 self, - task_id, - flow_id, - dataset_id, - setup_string=None, - output_files=None, - setup_id=None, - tags=None, - uploader=None, - uploader_name=None, - evaluations=None, - fold_evaluations=None, - sample_evaluations=None, - data_content=None, - trace=None, - model=None, - task_type=None, - task_evaluation_measure=None, - flow_name=None, - parameter_settings=None, - predictions_url=None, - task=None, - flow=None, - run_id=None, - description_text=None, - run_details=None, + task_id: int, + flow_id: int | None, + dataset_id: int | None, + setup_string: str | None = None, + output_files: dict[str, int] | None = None, + setup_id: int | None = None, + tags: list[str] | None = None, + uploader: int | None = None, + uploader_name: str | None = None, + evaluations: dict | None = None, + fold_evaluations: dict | None = None, + sample_evaluations: dict | None = None, + data_content: list[list] | None = None, + trace: OpenMLRunTrace | None = None, + model: object | None = None, + task_type: str | None = None, + task_evaluation_measure: str | None = None, + flow_name: str | None = None, + parameter_settings: list[dict[str, Any]] | None = None, + predictions_url: str | None = None, + task: OpenMLTask | None = None, + flow: OpenMLFlow | None = None, + run_id: 
int | None = None, + description_text: str | None = None, + run_details: str | None = None, ): self.uploader = uploader self.uploader_name = uploader_name @@ -153,12 +163,14 @@ def predictions(self) -> pd.DataFrame: else: raise RuntimeError("Run has no predictions.") self._predictions = pd.DataFrame( - arff_dict["data"], columns=[name for name, _ in arff_dict["attributes"]] + arff_dict["data"], + columns=[name for name, _ in arff_dict["attributes"]], ) return self._predictions @property - def id(self) -> Optional[int]: + def id(self) -> int | None: + """The ID of the run, None if not uploaded to the server yet.""" return self.run_id def _evaluation_summary(self, metric: str) -> str: @@ -181,15 +193,17 @@ def _evaluation_summary(self, metric: str) -> str: A formatted string that displays the metric's evaluation summary. The summary consists of the mean and std. """ + if self.fold_evaluations is None: + raise ValueError("No fold evaluations available.") fold_score_lists = self.fold_evaluations[metric].values() # Get the mean and std over all repetitions rep_means = [np.mean(list(x.values())) for x in fold_score_lists] rep_stds = [np.std(list(x.values())) for x in fold_score_lists] - return "{:.4f} +- {:.4f}".format(np.mean(rep_means), np.mean(rep_stds)) + return f"{np.mean(rep_means):.4f} +- {np.mean(rep_stds):.4f}" - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: """Collect all information to display in the __repr__ body.""" # Set up fields fields = { @@ -201,20 +215,26 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: "Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id), "Flow ID": self.flow_id, "Flow Name": self.flow_name, - "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), + "Flow URL": ( + openml.flows.OpenMLFlow.url_for_id(self.flow_id) + if self.flow_id is not None + else None + ), "Setup ID": 
self.setup_id, "Setup String": self.setup_string, "Dataset ID": self.dataset_id, - "Dataset URL": openml.datasets.OpenMLDataset.url_for_id(self.dataset_id), + "Dataset URL": ( + openml.datasets.OpenMLDataset.url_for_id(self.dataset_id) + if self.dataset_id is not None + else None + ), } # determines the order of the initial fields in which the information will be printed order = ["Uploader Name", "Uploader Profile", "Metric", "Result"] if self.uploader is not None: - fields["Uploader Profile"] = "{}/u/{}".format( - openml.config.get_server_base_url(), self.uploader - ) + fields["Uploader Profile"] = f"{openml.config.get_server_base_url()}/u/{self.uploader}" if self.run_id is not None: fields["Run URL"] = self.openml_url if self.evaluations is not None and self.task_evaluation_measure in self.evaluations: @@ -223,13 +243,11 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: # -- Add locally computed summary values if possible if "predictive_accuracy" in self.fold_evaluations: # OpenMLClassificationTask; OpenMLLearningCurveTask - # default: predictive_accuracy result_field = "Local Result - Accuracy (+- STD)" fields[result_field] = self._evaluation_summary("predictive_accuracy") order.append(result_field) elif "mean_absolute_error" in self.fold_evaluations: # OpenMLRegressionTask - # default: mean_absolute_error result_field = "Local Result - MAE (+- STD)" fields[result_field] = self._evaluation_summary("mean_absolute_error") order.append(result_field) @@ -255,10 +273,14 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: "Dataset ID", "Dataset URL", ] - return [(key, fields[key]) for key in order if key in fields] + return [ + (key, "None" if fields[key] is None else fields[key]) # type: ignore + for key in order + if key in fields + ] @classmethod - def from_filesystem(cls, directory: str, expect_model: bool = True) -> "OpenMLRun": + def from_filesystem(cls, directory: str | Path, expect_model: bool = 
True) -> OpenMLRun: # noqa: FBT001, FBT002 """ The inverse of the to_filesystem method. Instantiates an OpenMLRun object based on files stored on the file system. @@ -279,26 +301,26 @@ def from_filesystem(cls, directory: str, expect_model: bool = True) -> "OpenMLRu run : OpenMLRun the re-instantiated run object """ - # Avoiding cyclic imports import openml.runs.functions - if not os.path.isdir(directory): + directory = Path(directory) + if not directory.is_dir(): raise ValueError("Could not find folder") - description_path = os.path.join(directory, "description.xml") - predictions_path = os.path.join(directory, "predictions.arff") - trace_path = os.path.join(directory, "trace.arff") - model_path = os.path.join(directory, "model.pkl") + description_path = directory / "description.xml" + predictions_path = directory / "predictions.arff" + trace_path = directory / "trace.arff" + model_path = directory / "model.pkl" - if not os.path.isfile(description_path): + if not description_path.is_file(): raise ValueError("Could not find description.xml") - if not os.path.isfile(predictions_path): + if not predictions_path.is_file(): raise ValueError("Could not find predictions.arff") - if not os.path.isfile(model_path) and expect_model: + if (not model_path.is_file()) and expect_model: raise ValueError("Could not find model.pkl") - with open(description_path, "r") as fht: + with description_path.open() as fht: xml_string = fht.read() run = openml.runs.functions._create_run_from_xml(xml_string, from_server=False) @@ -307,25 +329,25 @@ def from_filesystem(cls, directory: str, expect_model: bool = True) -> "OpenMLRu run.flow = flow run.flow_name = flow.name - with open(predictions_path, "r") as fht: + with predictions_path.open() as fht: predictions = arff.load(fht) run.data_content = predictions["data"] - if os.path.isfile(model_path): + if model_path.is_file(): # note that it will load the model if the file exists, even if # expect_model is False - with open(model_path, "rb") as 
fhb: - run.model = pickle.load(fhb) + with model_path.open("rb") as fhb: + run.model = pickle.load(fhb) # noqa: S301 - if os.path.isfile(trace_path): + if trace_path.is_file(): run.trace = openml.runs.OpenMLRunTrace._from_filesystem(trace_path) return run def to_filesystem( self, - directory: str, - store_model: bool = True, + directory: str | Path, + store_model: bool = True, # noqa: FBT001, FBT002 ) -> None: """ The inverse of the from_filesystem method. Serializes a run @@ -344,32 +366,31 @@ def to_filesystem( """ if self.data_content is None or self.model is None: raise ValueError("Run should have been executed (and contain " "model / predictions)") + directory = Path(directory) + directory.mkdir(exist_ok=True, parents=True) - os.makedirs(directory, exist_ok=True) - if not os.listdir(directory) == []: - raise ValueError( - "Output directory {} should be empty".format(os.path.abspath(directory)) - ) + if any(directory.iterdir()): + raise ValueError(f"Output directory {directory.expanduser().resolve()} should be empty") run_xml = self._to_xml() predictions_arff = arff.dumps(self._generate_arff_dict()) # It seems like typing does not allow to define the same variable multiple times - with open(os.path.join(directory, "description.xml"), "w") as fh: # type: TextIO + with (directory / "description.xml").open("w") as fh: fh.write(run_xml) - with open(os.path.join(directory, "predictions.arff"), "w") as fh: + with (directory / "predictions.arff").open("w") as fh: fh.write(predictions_arff) if store_model: - with open(os.path.join(directory, "model.pkl"), "wb") as fh_b: # type: IO[bytes] + with (directory / "model.pkl").open("wb") as fh_b: pickle.dump(self.model, fh_b) - if self.flow_id is None: + if self.flow_id is None and self.flow is not None: self.flow.to_filesystem(directory) if self.trace is not None: self.trace._to_filesystem(directory) - def _generate_arff_dict(self) -> "OrderedDict[str, Any]": + def _generate_arff_dict(self) -> OrderedDict[str, Any]: 
"""Generates the arff dictionary for uploading predictions to the server. @@ -386,6 +407,7 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]": if self.data_content is None: raise ValueError("Run has not been executed.") if self.flow is None: + assert self.flow_id is not None, "Run has no associated flow id!" self.flow = get_flow(self.flow_id) if self.description_text is None: @@ -395,7 +417,7 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]": arff_dict = OrderedDict() # type: 'OrderedDict[str, Any]' arff_dict["data"] = self.data_content arff_dict["description"] = self.description_text - arff_dict["relation"] = "openml_task_{}_predictions".format(task.task_id) + arff_dict["relation"] = f"openml_task_{task.task_id}_predictions" if isinstance(task, OpenMLLearningCurveTask): class_labels = task.class_labels @@ -462,7 +484,7 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]": return arff_dict - def get_metric_fn(self, sklearn_fn, kwargs=None): + def get_metric_fn(self, sklearn_fn: Callable, kwargs: dict | None = None) -> np.ndarray: # noqa: PLR0915, PLR0912, C901 """Calculates metric scores based on predicted values. Assumes the run has been executed locally (and contains run_data). 
Furthermore, it assumes that the 'correct' or 'truth' attribute is specified in @@ -474,16 +496,18 @@ def get_metric_fn(self, sklearn_fn, kwargs=None): sklearn_fn : function a function pointer to a sklearn function that accepts ``y_true``, ``y_pred`` and ``**kwargs`` + kwargs : dict + kwargs for the function Returns ------- - scores : list - a list of floats, of length num_folds * num_repeats + scores : ndarray of scores of length num_folds * num_repeats + metric results """ - kwargs = kwargs if kwargs else dict() + kwargs = kwargs if kwargs else {} if self.data_content is not None and self.task_id is not None: predictions_arff = self._generate_arff_dict() - elif "predictions" in self.output_files: + elif (self.output_files is not None) and ("predictions" in self.output_files): predictions_file_url = openml._api_calls._file_id_to_url( self.output_files["predictions"], "predictions.arff", @@ -493,7 +517,7 @@ def get_metric_fn(self, sklearn_fn, kwargs=None): # TODO: make this a stream reader else: raise ValueError( - "Run should have been locally executed or " "contain outputfile reference." + "Run should have been locally executed or " "contain outputfile reference.", ) # Need to know more about the task to compute scores correctly @@ -510,7 +534,7 @@ def get_metric_fn(self, sklearn_fn, kwargs=None): if task.task_type_id != TaskType.CLUSTERING and "prediction" not in attribute_names: raise ValueError('Attribute "predict" should be set for ' "supervised task runs") - def _attribute_list_to_dict(attribute_list): + def _attribute_list_to_dict(attribute_list): # type: ignore # convenience function: Creates a mapping to map from the name of # attributes present in the arff prediction file to their index. 
# This is necessary because the number of classes can be different @@ -526,10 +550,7 @@ def _attribute_list_to_dict(attribute_list): fold_idx = attribute_dict["fold"] predicted_idx = attribute_dict["prediction"] # Assume supervised task - if ( - task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION - or task.task_type_id == TaskType.LEARNING_CURVE - ): + if task.task_type_id in (TaskType.SUPERVISED_CLASSIFICATION, TaskType.LEARNING_CURVE): correct_idx = attribute_dict["correct"] elif task.task_type_id == TaskType.SUPERVISED_REGRESSION: correct_idx = attribute_dict["truth"] @@ -545,27 +566,23 @@ def _attribute_list_to_dict(attribute_list): pred = predictions_arff["attributes"][predicted_idx][1] corr = predictions_arff["attributes"][correct_idx][1] raise ValueError( - "Predicted and Correct do not have equal values:" - " %s Vs. %s" % (str(pred), str(corr)) + "Predicted and Correct do not have equal values:" f" {pred!s} Vs. {corr!s}", ) # TODO: these could be cached - values_predict = {} - values_correct = {} - for line_idx, line in enumerate(predictions_arff["data"]): + values_predict: dict[int, dict[int, dict[int, list[float]]]] = {} + values_correct: dict[int, dict[int, dict[int, list[float]]]] = {} + for _line_idx, line in enumerate(predictions_arff["data"]): rep = line[repeat_idx] fold = line[fold_idx] - if has_samples: - samp = line[sample_idx] - else: - samp = 0 # No learning curve sample, always 0 + samp = line[sample_idx] if has_samples else 0 if task.task_type_id in [ TaskType.SUPERVISED_CLASSIFICATION, TaskType.LEARNING_CURVE, ]: prediction = predictions_arff["attributes"][predicted_idx][1].index( - line[predicted_idx] + line[predicted_idx], ) correct = predictions_arff["attributes"][predicted_idx][1].index(line[correct_idx]) elif task.task_type_id == TaskType.SUPERVISED_REGRESSION: @@ -585,19 +602,19 @@ def _attribute_list_to_dict(attribute_list): values_correct[rep][fold][samp].append(correct) scores = [] - for rep in values_predict.keys(): - for fold 
in values_predict[rep].keys(): + for rep in values_predict: + for fold in values_predict[rep]: last_sample = len(values_predict[rep][fold]) - 1 y_pred = values_predict[rep][fold][last_sample] y_true = values_correct[rep][fold][last_sample] scores.append(sklearn_fn(y_true, y_pred, **kwargs)) return np.array(scores) - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.run_id = int(xml_response["oml:upload_run"]["oml:run_id"]) - def _get_file_elements(self) -> Dict: + def _get_file_elements(self) -> dict: """Get file_elements to upload to the server. Derived child classes should overwrite this method as necessary. @@ -605,21 +622,22 @@ def _get_file_elements(self) -> Dict: """ if self.parameter_settings is None and self.model is None: raise PyOpenMLError( - "OpenMLRun must contain a model or be initialized with parameter_settings." + "OpenMLRun must contain a model or be initialized with parameter_settings.", ) if self.flow_id is None: if self.flow is None: raise PyOpenMLError( "OpenMLRun object does not contain a flow id or reference to OpenMLFlow " - "(these should have been added while executing the task). " + "(these should have been added while executing the task). ", ) - else: - # publish the linked Flow before publishing the run. - self.flow.publish() - self.flow_id = self.flow.flow_id + + # publish the linked Flow before publishing the run. 
+ self.flow.publish() + self.flow_id = self.flow.flow_id if self.parameter_settings is None: if self.flow is None: + assert self.flow_id is not None # for mypy self.flow = openml.flows.get_flow(self.flow_id) self.parameter_settings = self.flow.extension.obtain_parameter_values( self.flow, @@ -637,7 +655,7 @@ def _get_file_elements(self) -> Dict: file_elements["trace"] = ("trace.arff", trace_arff) return file_elements - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": + def _to_dict(self) -> dict[str, dict]: # noqa: PLR0912, C901 """Creates a dictionary representation of self.""" description = OrderedDict() # type: 'OrderedDict' description["oml:run"] = OrderedDict() @@ -657,7 +675,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": self.sample_evaluations is not None and len(self.sample_evaluations) > 0 ): description["oml:run"]["oml:output_data"] = OrderedDict() - description["oml:run"]["oml:output_data"]["oml:evaluation"] = list() + description["oml:run"]["oml:output_data"]["oml:evaluation"] = [] if self.fold_evaluations is not None: for measure in self.fold_evaluations: for repeat in self.fold_evaluations[measure]: @@ -668,7 +686,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": ("@fold", str(fold)), ("oml:name", measure), ("oml:value", str(value)), - ] + ], ) description["oml:run"]["oml:output_data"]["oml:evaluation"].append(current) if self.sample_evaluations is not None: @@ -683,9 +701,9 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": ("@sample", str(sample)), ("oml:name", measure), ("oml:value", str(value)), - ] + ], ) description["oml:run"]["oml:output_data"]["oml:evaluation"].append( - current + current, ) return description diff --git a/openml/runs/trace.py b/openml/runs/trace.py index f6b038a55..3b7d60c2f 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -1,10 +1,12 @@ # License: BSD 3-Clause +from __future__ import annotations +import json from collections import OrderedDict from dataclasses import 
dataclass -import json -import os -from typing import List, Tuple, Optional # noqa F401 +from pathlib import Path +from typing import IO, Any, Iterator +from typing_extensions import Self import arff import xmltodict @@ -19,7 +21,83 @@ ] -class OpenMLRunTrace(object): +@dataclass +class OpenMLTraceIteration: + """ + OpenML Trace Iteration: parsed output from Run Trace call + Exactly one of `setup_string` or `parameters` must be provided. + + Parameters + ---------- + repeat : int + repeat number (in case of no repeats: 0) + + fold : int + fold number (in case of no folds: 0) + + iteration : int + iteration number of optimization procedure + + setup_string : str, optional + json string representing the parameters + If not provided, ``parameters`` should be set. + + evaluation : double + The evaluation that was awarded to this trace iteration. + Measure is defined by the task + + selected : bool + Whether this was the best of all iterations, and hence + selected for making predictions. Per fold/repeat there + should be only one iteration selected + + parameters : OrderedDict, optional + Dictionary specifying parameter names and their values. + If not provided, ``setup_string`` should be set. 
+ """ + + repeat: int + fold: int + iteration: int + + evaluation: float + selected: bool + + setup_string: dict[str, str] | None = None + parameters: dict[str, str | int | float] | None = None + + def __post_init__(self) -> None: + # TODO: refactor into one argument of type + if self.setup_string and self.parameters: + raise ValueError( + "Can only be instantiated with either `setup_string` or `parameters` argument.", + ) + + if not (self.setup_string or self.parameters): + raise ValueError( + "Either `setup_string` or `parameters` needs to be passed as argument.", + ) + + if self.parameters is not None and not isinstance(self.parameters, dict): + raise TypeError( + "argument parameters is not an instance of OrderedDict, but %s" + % str(type(self.parameters)), + ) + + def get_parameters(self) -> dict[str, Any]: + """Get the parameters of this trace iteration.""" + # parameters have prefix 'parameter_' + if self.setup_string: + return { + param[len(PREFIX) :]: json.loads(value) + for param, value in self.setup_string.items() + } + + assert self.parameters is not None + return {param[len(PREFIX) :]: value for param, value in self.parameters.items()} + + +class OpenMLRunTrace: """OpenML Run Trace: parsed output from Run Trace call Parameters @@ -33,7 +111,20 @@ class OpenMLRunTrace(object): """ - def __init__(self, run_id, trace_iterations): + def __init__( + self, + run_id: int | None, + trace_iterations: dict[tuple[int, int, int], OpenMLTraceIteration], + ): + """Object to hold the trace content of a run. + + Parameters + ---------- + run_id : int + Id for which the trace content is to be stored. + trace_iterations : List[List] + The trace content obtained by running a flow on a task. 
+ """ self.run_id = run_id self.trace_iterations = trace_iterations @@ -50,7 +141,7 @@ def get_selected_iteration(self, fold: int, repeat: int) -> int: repeat: int Returns - ---------- + ------- int The trace iteration from the given fold and repeat that was selected as the best iteration by the search procedure @@ -59,11 +150,15 @@ def get_selected_iteration(self, fold: int, repeat: int) -> int: if r == repeat and f == fold and self.trace_iterations[(r, f, i)].selected is True: return i raise ValueError( - "Could not find the selected iteration for rep/fold %d/%d" % (repeat, fold) + "Could not find the selected iteration for rep/fold %d/%d" % (repeat, fold), ) @classmethod - def generate(cls, attributes, content): + def generate( + cls, + attributes: list[tuple[str, str]], + content: list[list[int | float | str]], + ) -> OpenMLRunTrace: """Generates an OpenMLRunTrace. Generates the trace object from the attributes and content extracted @@ -71,7 +166,6 @@ def generate(cls, attributes, content): Parameters ---------- - attributes : list List of tuples describing the arff attributes. @@ -83,17 +177,16 @@ def generate(cls, attributes, content): ------- OpenMLRunTrace """ - if content is None: raise ValueError("Trace content not available.") - elif attributes is None: + if attributes is None: raise ValueError("Trace attributes not available.") - elif len(content) == 0: + if len(content) == 0: raise ValueError("Trace content is empty.") - elif len(attributes) != len(content[0]): + if len(attributes) != len(content[0]): raise ValueError( "Trace_attributes and trace_content not compatible:" - " %s vs %s" % (attributes, content[0]) + f" {attributes} vs {content[0]}", ) return cls._trace_from_arff_struct( @@ -104,23 +197,25 @@ def generate(cls, attributes, content): ) @classmethod - def _from_filesystem(cls, file_path: str) -> "OpenMLRunTrace": + def _from_filesystem(cls, file_path: str | Path) -> OpenMLRunTrace: """ Logic to deserialize the trace from the filesystem. 
Parameters ---------- - file_path: str + file_path: str | Path File path where the trace arff is stored. Returns - ---------- + ------- OpenMLRunTrace """ - if not os.path.isfile(file_path): + file_path = Path(file_path) + + if not file_path.exists(): raise ValueError("Trace file doesn't exist") - with open(file_path, "r") as fp: + with file_path.open("r") as fp: trace_arff = arff.load(fp) for trace_idx in range(len(trace_arff["data"])): @@ -128,27 +223,28 @@ def _from_filesystem(cls, file_path: str) -> "OpenMLRunTrace": # (fold, repeat, trace_iteration) these should be int for line_idx in range(3): trace_arff["data"][trace_idx][line_idx] = int( - trace_arff["data"][trace_idx][line_idx] + trace_arff["data"][trace_idx][line_idx], ) return cls.trace_from_arff(trace_arff) - def _to_filesystem(self, file_path): + def _to_filesystem(self, file_path: str | Path) -> None: """Serialize the trace object to the filesystem. Serialize the trace object as an arff. Parameters ---------- - file_path: str + file_path: str | Path File path where the trace arff will be stored. """ + trace_path = Path(file_path) / "trace.arff" trace_arff = arff.dumps(self.trace_to_arff()) - with open(os.path.join(file_path, "trace.arff"), "w") as f: + with trace_path.open("w") as f: f.write(trace_arff) - def trace_to_arff(self): + def trace_to_arff(self) -> dict[str, Any]: """Generate the arff dictionary for uploading predictions to the server. Uses the trace object to generate an arff dictionary representation. 
@@ -174,24 +270,23 @@ def trace_to_arff(self): [ (PREFIX + parameter, "STRING") for parameter in next(iter(self.trace_iterations.values())).get_parameters() - ] + ], ) - arff_dict = OrderedDict() + arff_dict: dict[str, Any] = {} data = [] for trace_iteration in self.trace_iterations.values(): tmp_list = [] - for attr, _ in trace_attributes: - if attr.startswith(PREFIX): - attr = attr[len(PREFIX) :] + for _attr, _ in trace_attributes: + if _attr.startswith(PREFIX): + attr = _attr[len(PREFIX) :] value = trace_iteration.get_parameters()[attr] else: + attr = _attr value = getattr(trace_iteration, attr) + if attr == "selected": - if value: - tmp_list.append("true") - else: - tmp_list.append("false") + tmp_list.append("true" if value else "false") else: tmp_list.append(value) data.append(tmp_list) @@ -203,7 +298,7 @@ def trace_to_arff(self): return arff_dict @classmethod - def trace_from_arff(cls, arff_obj): + def trace_from_arff(cls, arff_obj: dict[str, Any]) -> OpenMLRunTrace: """Generate trace from arff trace. Creates a trace file from arff object (for example, generated by a @@ -227,7 +322,30 @@ def trace_from_arff(cls, arff_obj): ) @classmethod - def _trace_from_arff_struct(cls, attributes, content, error_message): + def _trace_from_arff_struct( + cls, + attributes: list[tuple[str, str]], + content: list[list[int | float | str]], + error_message: str, + ) -> Self: + """Generate a trace dictionary from ARFF structure. + + Parameters + ---------- + cls : type + The trace object to be created. + attributes : list[tuple[str, str]] + Attribute descriptions. + content : list[list[int | float | str]]] + List of instances. + error_message : str + Error message to raise if `setup_string` is in `attributes`. + + Returns + ------- + OrderedDict + A dictionary representing the trace. 
+ """ trace = OrderedDict() attribute_idx = {att[0]: idx for idx, att in enumerate(attributes)} @@ -241,17 +359,16 @@ def _trace_from_arff_struct(cls, attributes, content, error_message): # they are not parameters parameter_attributes = [] for attribute in attribute_idx: - if attribute in REQUIRED_ATTRIBUTES: - continue - elif attribute == "setup_string": + if attribute in REQUIRED_ATTRIBUTES or attribute == "setup_string": continue - elif not attribute.startswith(PREFIX): + + if not attribute.startswith(PREFIX): raise ValueError( - "Encountered unknown attribute %s that does not start " - "with prefix %s" % (attribute, PREFIX) + f"Encountered unknown attribute {attribute} that does not start " + f"with prefix {PREFIX}", ) - else: - parameter_attributes.append(attribute) + + parameter_attributes.append(attribute) for itt in content: repeat = int(itt[attribute_idx["repeat"]]) @@ -266,12 +383,12 @@ def _trace_from_arff_struct(cls, attributes, content, error_message): else: raise ValueError( 'expected {"true", "false"} value for selected field, ' - "received: %s" % selected_value + "received: %s" % selected_value, ) - parameters = OrderedDict( - [(attribute, itt[attribute_idx[attribute]]) for attribute in parameter_attributes] - ) + parameters = { + attribute: itt[attribute_idx[attribute]] for attribute in parameter_attributes + } current = OpenMLTraceIteration( repeat=repeat, @@ -287,7 +404,7 @@ def _trace_from_arff_struct(cls, attributes, content, error_message): return cls(None, trace) @classmethod - def trace_from_xml(cls, xml): + def trace_from_xml(cls, xml: str | Path | IO) -> OpenMLRunTrace: """Generate trace from xml. Creates a trace file from the xml description. @@ -304,6 +421,9 @@ def trace_from_xml(cls, xml): Object containing the run id and a dict containing the trace iterations. 
""" + if isinstance(xml, Path): + xml = str(xml.absolute()) + result_dict = xmltodict.parse(xml, force_list=("oml:trace_iteration",))["oml:trace"] run_id = result_dict["oml:run_id"] @@ -328,7 +448,7 @@ def trace_from_xml(cls, xml): else: raise ValueError( 'expected {"true", "false"} value for ' - "selected field, received: %s" % selected_value + "selected field, received: %s" % selected_value, ) current = OpenMLTraceIteration( @@ -344,30 +464,55 @@ def trace_from_xml(cls, xml): return cls(run_id, trace) @classmethod - def merge_traces(cls, traces: List["OpenMLRunTrace"]) -> "OpenMLRunTrace": - merged_trace = ( - OrderedDict() - ) # type: OrderedDict[Tuple[int, int, int], OpenMLTraceIteration] # noqa E501 + def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace: + """Merge multiple traces into a single trace. + + Parameters + ---------- + cls : type + Type of the trace object to be created. + traces : List[OpenMLRunTrace] + List of traces to merge. + + Returns + ------- + OpenMLRunTrace + A trace object representing the merged traces. + + Raises + ------ + ValueError + If the parameters in the iterations of the traces being merged are not equal. + If a key (repeat, fold, iteration) is encountered twice while merging the traces. 
+ """ + merged_trace: dict[tuple[int, int, int], OpenMLTraceIteration] = {} previous_iteration = None for trace in traces: for iteration in trace: key = (iteration.repeat, iteration.fold, iteration.iteration) + + assert iteration.parameters is not None + param_keys = iteration.parameters.keys() + if previous_iteration is not None: - if list(merged_trace[previous_iteration].parameters.keys()) != list( - iteration.parameters.keys() - ): + trace_itr = merged_trace[previous_iteration] + + assert trace_itr.parameters is not None + trace_itr_keys = trace_itr.parameters.keys() + + if list(param_keys) != list(trace_itr_keys): raise ValueError( "Cannot merge traces because the parameters are not equal: " "{} vs {}".format( - list(merged_trace[previous_iteration].parameters.keys()), + list(trace_itr.parameters.keys()), list(iteration.parameters.keys()), - ) + ), ) if key in merged_trace: raise ValueError( - "Cannot merge traces because key '{}' was encountered twice".format(key) + f"Cannot merge traces because key '{key}' was encountered twice", ) merged_trace[key] = iteration @@ -375,88 +520,11 @@ def merge_traces(cls, traces: List["OpenMLRunTrace"]) -> "OpenMLRunTrace": return cls(None, merged_trace) - def __repr__(self): + def __repr__(self) -> str: return "[Run id: {}, {} trace iterations]".format( -1 if self.run_id is None else self.run_id, len(self.trace_iterations), ) - def __iter__(self): - for val in self.trace_iterations.values(): - yield val - - -@dataclass -class OpenMLTraceIteration: - """ - OpenML Trace Iteration: parsed output from Run Trace call - Exactly one of `setup_string` or `parameters` must be provided. - - Parameters - ---------- - repeat : int - repeat number (in case of no repeats: 0) - - fold : int - fold number (in case of no folds: 0) - - iteration : int - iteration number of optimization procedure - - setup_string : str, optional - json string representing the parameters - If not provided, ``parameters`` should be set. 
- - evaluation : double - The evaluation that was awarded to this trace iteration. - Measure is defined by the task - - selected : bool - Whether this was the best of all iterations, and hence - selected for making predictions. Per fold/repeat there - should be only one iteration selected - - parameters : OrderedDict, optional - Dictionary specifying parameter names and their values. - If not provided, ``setup_string`` should be set. - """ - - repeat: int - fold: int - iteration: int - - evaluation: float - selected: bool - - setup_string: Optional[str] = None - parameters: Optional[OrderedDict] = None - - def __post_init__(self): - # TODO: refactor into one argument of type - if self.setup_string and self.parameters: - raise ValueError( - "Can only be instantiated with either `setup_string` or `parameters` argument." - ) - elif not (self.setup_string or self.parameters): - raise ValueError( - "Either `setup_string` or `parameters` needs to be passed as argument." - ) - if self.parameters is not None and not isinstance(self.parameters, OrderedDict): - raise TypeError( - "argument parameters is not an instance of OrderedDict, but %s" - % str(type(self.parameters)) - ) - - def get_parameters(self): - result = {} - # parameters have prefix 'parameter_' - - if self.setup_string: - for param in self.setup_string: - key = param[len(PREFIX) :] - value = self.setup_string[param] - result[key] = json.loads(value) - else: - for param, value in self.parameters.items(): - result[param[len(PREFIX) :]] = value - return result + def __iter__(self) -> Iterator[OpenMLTraceIteration]: + yield from self.trace_iterations.values() diff --git a/openml/setups/__init__.py b/openml/setups/__init__.py index 31f4f503f..dd38cb9b7 100644 --- a/openml/setups/__init__.py +++ b/openml/setups/__init__.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause -from .setup import OpenMLSetup, OpenMLParameter -from .functions import get_setup, list_setups, setup_exists, initialize_model +from .functions import 
get_setup, initialize_model, list_setups, setup_exists +from .setup import OpenMLParameter, OpenMLSetup __all__ = [ "OpenMLSetup", diff --git a/openml/setups/functions.py b/openml/setups/functions.py index b9af97c6e..ee0c6d707 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -1,28 +1,31 @@ # License: BSD 3-Clause +from __future__ import annotations + import warnings from collections import OrderedDict -import io -import os -from typing import Any, Union, List, Dict, Optional +from pathlib import Path +from typing import Any, Iterable +from typing_extensions import Literal -import xmltodict import pandas as pd +import xmltodict import openml -from .. import config -from .setup import OpenMLSetup, OpenMLParameter -from openml.flows import flow_exists import openml.exceptions import openml.utils +from openml import config +from openml.flows import OpenMLFlow, flow_exists +from .setup import OpenMLParameter, OpenMLSetup -def setup_exists(flow) -> int: + +def setup_exists(flow: OpenMLFlow) -> int: """ Checks whether a hyperparameter configuration already exists on the server. Parameters ---------- - flow : flow + flow : OpenMLFlow The openml flow object. Should have flow id present for the main flow and all subflows (i.e., it should be downloaded from the server by means of flow.get, and not instantiated locally) @@ -44,40 +47,57 @@ def setup_exists(flow) -> int: if exists != flow.flow_id: raise ValueError( f"Local flow id ({flow.id}) differs from server id ({exists}). " - "If this issue persists, please contact the developers." 
+ "If this issue persists, please contact the developers.", ) openml_param_settings = flow.extension.obtain_parameter_values(flow) description = xmltodict.unparse(_to_dict(flow.flow_id, openml_param_settings), pretty=True) file_elements = { - "description": ("description.arff", description) + "description": ("description.arff", description), } # type: openml._api_calls.FILE_ELEMENTS_TYPE result = openml._api_calls._perform_api_call( - "/setup/exists/", "post", file_elements=file_elements + "/setup/exists/", + "post", + file_elements=file_elements, ) result_dict = xmltodict.parse(result) setup_id = int(result_dict["oml:setup_exists"]["oml:id"]) return setup_id if setup_id > 0 else False -def _get_cached_setup(setup_id): - """Load a run from the cache.""" - cache_dir = config.get_cache_directory() - setup_cache_dir = os.path.join(cache_dir, "setups", str(setup_id)) +def _get_cached_setup(setup_id: int) -> OpenMLSetup: + """Load a run from the cache. + + Parameters + ---------- + setup_id : int + ID of the setup to be loaded. + + Returns + ------- + OpenMLSetup + The loaded setup object. + + Raises + ------ + OpenMLCacheException + If the setup file for the given setup ID is not cached. 
+ """ + cache_dir = Path(config.get_cache_directory()) + setup_cache_dir = cache_dir / "setups" / str(setup_id) try: - setup_file = os.path.join(setup_cache_dir, "description.xml") - with io.open(setup_file, encoding="utf8") as fh: + setup_file = setup_cache_dir / "description.xml" + with setup_file.open(encoding="utf8") as fh: setup_xml = xmltodict.parse(fh.read()) - setup = _create_setup_from_xml(setup_xml, output_format="object") - return setup + return _create_setup_from_xml(setup_xml, output_format="object") # type: ignore - except (OSError, IOError): + except OSError as e: raise openml.exceptions.OpenMLCacheException( - "Setup file for setup id %d not cached" % setup_id - ) + "Setup file for setup id %d not cached" % setup_id, + ) from e -def get_setup(setup_id): +def get_setup(setup_id: int) -> OpenMLSetup: """ Downloads the setup (configuration) description from OpenML and returns a structured object @@ -89,34 +109,33 @@ def get_setup(setup_id): Returns ------- - dict or OpenMLSetup(an initialized openml setup object) + OpenMLSetup (an initialized openml setup object) """ - setup_dir = os.path.join(config.get_cache_directory(), "setups", str(setup_id)) - setup_file = os.path.join(setup_dir, "description.xml") + setup_dir = Path(config.get_cache_directory()) / "setups" / str(setup_id) + setup_dir.mkdir(exist_ok=True, parents=True) - if not os.path.exists(setup_dir): - os.makedirs(setup_dir) + setup_file = setup_dir / "description.xml" try: return _get_cached_setup(setup_id) except openml.exceptions.OpenMLCacheException: url_suffix = "/setup/%d" % setup_id setup_xml = openml._api_calls._perform_api_call(url_suffix, "get") - with io.open(setup_file, "w", encoding="utf8") as fh: + with setup_file.open("w", encoding="utf8") as fh: fh.write(setup_xml) result_dict = xmltodict.parse(setup_xml) - return _create_setup_from_xml(result_dict, output_format="object") + return _create_setup_from_xml(result_dict, output_format="object") # type: ignore -def list_setups( - 
offset: Optional[int] = None, - size: Optional[int] = None, - flow: Optional[int] = None, - tag: Optional[str] = None, - setup: Optional[List] = None, - output_format: str = "object", -) -> Union[Dict, pd.DataFrame]: +def list_setups( # noqa: PLR0913 + offset: int | None = None, + size: int | None = None, + flow: int | None = None, + tag: str | None = None, + setup: Iterable[int] | None = None, + output_format: Literal["object", "dict", "dataframe"] = "object", +) -> dict | pd.DataFrame: """ List all setups matching all of the given filters. @@ -126,10 +145,9 @@ def list_setups( size : int, optional flow : int, optional tag : str, optional - setup : list(int), optional + setup : Iterable[int], optional output_format: str, optional (default='object') The parameter decides the format of the output. - - If 'object' the output is a dict of OpenMLSetup objects - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame @@ -139,7 +157,7 @@ def list_setups( """ if output_format not in ["dataframe", "dict", "object"]: raise ValueError( - "Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable." + "Invalid output format selected. 
" "Only 'dict', 'object', or 'dataframe' applicable.", ) # TODO: [0.15] @@ -152,8 +170,8 @@ def list_setups( warnings.warn(msg, category=FutureWarning, stacklevel=2) batch_size = 1000 # batch size for setups is lower - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_setups, offset=offset, size=size, @@ -164,7 +182,11 @@ def list_setups( ) -def _list_setups(setup=None, output_format="object", **kwargs): +def _list_setups( + setup: Iterable[int] | None = None, + output_format: Literal["dict", "dataframe", "object"] = "object", + **kwargs: Any, +) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]: """ Perform API call `/setup/list/{filters}` @@ -179,26 +201,28 @@ def _list_setups(setup=None, output_format="object", **kwargs): The parameter decides the format of the output. - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame + - If 'object' the output is a dict of OpenMLSetup objects kwargs: dict, optional Legal filter operators: flow, setup, limit, offset, tag. 
Returns ------- - dict or dataframe + dict or dataframe or list[OpenMLSetup] """ - api_call = "setup/list" if setup is not None: api_call += "/setup/%s" % ",".join([str(int(i)) for i in setup]) if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" return __list_setups(api_call=api_call, output_format=output_format) -def __list_setups(api_call, output_format="object"): +def __list_setups( + api_call: str, output_format: Literal["dict", "dataframe", "object"] = "object" +) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]: """Helper function to parse API calls which are lists of setups""" xml_string = openml._api_calls._perform_api_call(api_call, "get") setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",)) @@ -206,32 +230,35 @@ def __list_setups(api_call, output_format="object"): # Minimalistic check if the XML is useful if "oml:setups" not in setups_dict: raise ValueError( - 'Error in return XML, does not contain "oml:setups":' " %s" % str(setups_dict) + 'Error in return XML, does not contain "oml:setups":' " %s" % str(setups_dict), ) - elif "@xmlns:oml" not in setups_dict["oml:setups"]: + + if "@xmlns:oml" not in setups_dict["oml:setups"]: raise ValueError( "Error in return XML, does not contain " - '"oml:setups"/@xmlns:oml: %s' % str(setups_dict) + '"oml:setups"/@xmlns:oml: %s' % str(setups_dict), ) - elif setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri: + + if setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri: raise ValueError( "Error in return XML, value of " '"oml:seyups"/@xmlns:oml is not ' - '"%s": %s' % (openml_uri, str(setups_dict)) + f'"{openml_uri}": {setups_dict!s}', ) - assert type(setups_dict["oml:setups"]["oml:setup"]) == list, type(setups_dict["oml:setups"]) + assert isinstance(setups_dict["oml:setups"]["oml:setup"], list), type(setups_dict["oml:setups"]) - setups = dict() + setups = {} for setup_ in 
setups_dict["oml:setups"]["oml:setup"]: # making it a dict to give it the right format current = _create_setup_from_xml( - {"oml:setup_parameters": setup_}, output_format=output_format + {"oml:setup_parameters": setup_}, + output_format=output_format, ) if output_format == "object": - setups[current.setup_id] = current + setups[current.setup_id] = current # type: ignore else: - setups[current["setup_id"]] = current + setups[current["setup_id"]] = current # type: ignore if output_format == "dataframe": setups = pd.DataFrame.from_dict(setups, orient="index") @@ -259,21 +286,38 @@ def initialize_model(setup_id: int) -> Any: # instead of using scikit-learns or any other library's "set_params" function, we override the # OpenMLFlow objects default parameter value so we can utilize the # Extension.flow_to_model() function to reinitialize the flow with the set defaults. - for hyperparameter in setup.parameters.values(): - structure = flow.get_structure("flow_id") - if len(structure[hyperparameter.flow_id]) > 0: - subflow = flow.get_subflow(structure[hyperparameter.flow_id]) - else: - subflow = flow - subflow.parameters[hyperparameter.parameter_name] = hyperparameter.value + if setup.parameters is not None: + for hyperparameter in setup.parameters.values(): + structure = flow.get_structure("flow_id") + if len(structure[hyperparameter.flow_id]) > 0: + subflow = flow.get_subflow(structure[hyperparameter.flow_id]) + else: + subflow = flow + subflow.parameters[hyperparameter.parameter_name] = hyperparameter.value + + return flow.extension.flow_to_model(flow) - model = flow.extension.flow_to_model(flow) - return model +def _to_dict( + flow_id: int, openml_parameter_settings: list[OpenMLParameter] | list[dict[str, Any]] +) -> OrderedDict: + """Convert a flow ID and a list of OpenML parameter settings to + a dictionary representation that can be serialized to XML. + + Parameters + ---------- + flow_id : int + ID of the flow. 
+ openml_parameter_settings : List[OpenMLParameter] + A list of OpenML parameter settings. -def _to_dict(flow_id, openml_parameter_settings): + Returns + ------- + OrderedDict + A dictionary representation of the flow ID and parameter settings. + """ # for convenience, this function (ab)uses the run object. - xml = OrderedDict() + xml: OrderedDict = OrderedDict() xml["oml:run"] = OrderedDict() xml["oml:run"]["@xmlns:oml"] = "http://openml.org/openml" xml["oml:run"]["oml:flow_id"] = flow_id @@ -282,43 +326,56 @@ def _to_dict(flow_id, openml_parameter_settings): return xml -def _create_setup_from_xml(result_dict, output_format="object"): - """ - Turns an API xml result into a OpenMLSetup object (or dict) - """ +def _create_setup_from_xml( + result_dict: dict, output_format: Literal["dict", "dataframe", "object"] = "object" +) -> OpenMLSetup | dict[str, int | dict[int, Any] | None]: + """Turns an API xml result into a OpenMLSetup object (or dict)""" + if output_format in ["dataframe", "dict"]: + _output_format: Literal["dict", "object"] = "dict" + elif output_format == "object": + _output_format = "object" + else: + raise ValueError( + f"Invalid output format selected: {output_format}" + "Only 'dict', 'object', or 'dataframe' applicable.", + ) + setup_id = int(result_dict["oml:setup_parameters"]["oml:setup_id"]) flow_id = int(result_dict["oml:setup_parameters"]["oml:flow_id"]) - parameters = {} if "oml:parameter" not in result_dict["oml:setup_parameters"]: parameters = None else: + parameters = {} # basically all others xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"] if isinstance(xml_parameters, dict): - id = int(xml_parameters["oml:id"]) - parameters[id] = _create_setup_parameter_from_xml( - result_dict=xml_parameters, output_format=output_format + oml_id = int(xml_parameters["oml:id"]) + parameters[oml_id] = _create_setup_parameter_from_xml( + result_dict=xml_parameters, + output_format=_output_format, ) elif isinstance(xml_parameters, list): 
for xml_parameter in xml_parameters: - id = int(xml_parameter["oml:id"]) - parameters[id] = _create_setup_parameter_from_xml( - result_dict=xml_parameter, output_format=output_format + oml_id = int(xml_parameter["oml:id"]) + parameters[oml_id] = _create_setup_parameter_from_xml( + result_dict=xml_parameter, + output_format=_output_format, ) else: raise ValueError( "Expected None, list or dict, received " - "something else: %s" % str(type(xml_parameters)) + "something else: %s" % str(type(xml_parameters)), ) - if output_format in ["dataframe", "dict"]: - return_dict = {"setup_id": setup_id, "flow_id": flow_id} - return_dict["parameters"] = parameters - return return_dict + if _output_format in ["dataframe", "dict"]: + return {"setup_id": setup_id, "flow_id": flow_id, "parameters": parameters} return OpenMLSetup(setup_id, flow_id, parameters) -def _create_setup_parameter_from_xml(result_dict, output_format="object"): +def _create_setup_parameter_from_xml( + result_dict: dict[str, str], output_format: Literal["object", "dict"] = "object" +) -> dict[str, int | str] | OpenMLParameter: + """Create an OpenMLParameter object or a dictionary from an API xml result.""" if output_format == "object": return OpenMLParameter( input_id=int(result_dict["oml:id"]), @@ -330,14 +387,16 @@ def _create_setup_parameter_from_xml(result_dict, output_format="object"): default_value=result_dict["oml:default_value"], value=result_dict["oml:value"], ) - else: - return { - "input_id": int(result_dict["oml:id"]), - "flow_id": int(result_dict["oml:flow_id"]), - "flow_name": result_dict["oml:flow_name"], - "full_name": result_dict["oml:full_name"], - "parameter_name": result_dict["oml:parameter_name"], - "data_type": result_dict["oml:data_type"], - "default_value": result_dict["oml:default_value"], - "value": result_dict["oml:value"], - } + + # FIXME: likely we want to crash here if unknown output_format but not backwards compatible + # output_format == "dict" case, + return { + "input_id": 
int(result_dict["oml:id"]), + "flow_id": int(result_dict["oml:flow_id"]), + "flow_name": result_dict["oml:flow_name"], + "full_name": result_dict["oml:full_name"], + "parameter_name": result_dict["oml:parameter_name"], + "data_type": result_dict["oml:data_type"], + "default_value": result_dict["oml:default_value"], + "value": result_dict["oml:value"], + } diff --git a/openml/setups/setup.py b/openml/setups/setup.py index 44919fd09..e8dc059e7 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -1,9 +1,13 @@ # License: BSD 3-Clause +from __future__ import annotations + +from typing import Any import openml.config +import openml.flows -class OpenMLSetup(object): +class OpenMLSetup: """Setup object (a.k.a. Configuration). Parameters @@ -16,20 +20,21 @@ class OpenMLSetup(object): The setting of the parameters """ - def __init__(self, setup_id, flow_id, parameters): + def __init__(self, setup_id: int, flow_id: int, parameters: dict[int, Any] | None): if not isinstance(setup_id, int): raise ValueError("setup id should be int") + if not isinstance(flow_id, int): raise ValueError("flow id should be int") - if parameters is not None: - if not isinstance(parameters, dict): - raise ValueError("parameters should be dict") + + if parameters is not None and not isinstance(parameters, dict): + raise ValueError("parameters should be dict") self.setup_id = setup_id self.flow_id = flow_id self.parameters = parameters - def __repr__(self): + def __repr__(self) -> str: header = "OpenML Setup" header = "{}\n{}\n".format(header, "=" * len(header)) @@ -37,20 +42,22 @@ def __repr__(self): "Setup ID": self.setup_id, "Flow ID": self.flow_id, "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), - "# of Parameters": len(self.parameters), + "# of Parameters": ( + len(self.parameters) if self.parameters is not None else float("nan") + ), } # determines the order in which the information will be printed order = ["Setup ID", "Flow ID", "Flow URL", "# of Parameters"] - 
fields = [(key, fields[key]) for key in order if key in fields] + _fields = [(key, fields[key]) for key in order if key in fields] - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = "\n".join(field_line_format.format(name, value) for name, value in fields) + longest_field_name_length = max(len(name) for name, _ in _fields) + field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" + body = "\n".join(field_line_format.format(name, value) for name, value in _fields) return header + body -class OpenMLParameter(object): +class OpenMLParameter: """Parameter object (used in setup). Parameters @@ -75,16 +82,16 @@ class OpenMLParameter(object): If the parameter was set, the value that it was set to. """ - def __init__( + def __init__( # noqa: PLR0913 self, - input_id, - flow_id, - flow_name, - full_name, - parameter_name, - data_type, - default_value, - value, + input_id: int, + flow_id: int, + flow_name: str, + full_name: str, + parameter_name: str, + data_type: str, + default_value: str, + value: str, ): self.id = input_id self.flow_id = flow_id @@ -95,7 +102,7 @@ def __init__( self.default_value = default_value self.value = value - def __repr__(self): + def __repr__(self) -> str: header = "OpenML Parameter" header = "{}\n{}\n".format(header, "=" * len(header)) @@ -110,11 +117,11 @@ def __repr__(self): # indented prints for parameter attributes # indention = 2 spaces + 1 | + 2 underscores indent = "{}|{}".format(" " * 2, "_" * 2) - parameter_data_type = "{}Data Type".format(indent) + parameter_data_type = f"{indent}Data Type" fields[parameter_data_type] = self.data_type - parameter_default = "{}Default".format(indent) + parameter_default = f"{indent}Default" fields[parameter_default] = self.default_value - parameter_value = "{}Value".format(indent) + parameter_value = f"{indent}Value" fields[parameter_value] = self.value # determines the order in which the 
information will be printed @@ -128,9 +135,9 @@ def __repr__(self): parameter_default, parameter_value, ] - fields = [(key, fields[key]) for key in order if key in fields] + _fields = [(key, fields[key]) for key in order if key in fields] - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = "\n".join(field_line_format.format(name, value) for name, value in fields) + longest_field_name_length = max(len(name) for name, _ in _fields) + field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" + body = "\n".join(field_line_format.format(name, value) for name, value in _fields) return header + body diff --git a/openml/study/__init__.py b/openml/study/__init__.py index 030ee05c2..b7d77fec4 100644 --- a/openml/study/__init__.py +++ b/openml/study/__init__.py @@ -1,23 +1,22 @@ # License: BSD 3-Clause -from .study import OpenMLStudy, OpenMLBenchmarkSuite from .functions import ( - get_study, - get_suite, - create_study, - create_benchmark_suite, - update_study_status, - update_suite_status, attach_to_study, attach_to_suite, - detach_from_study, - detach_from_suite, + create_benchmark_suite, + create_study, delete_study, delete_suite, + detach_from_study, + detach_from_suite, + get_study, + get_suite, list_studies, list_suites, + update_study_status, + update_suite_status, ) - +from .study import OpenMLBenchmarkSuite, OpenMLStudy __all__ = [ "OpenMLStudy", diff --git a/openml/study/functions.py b/openml/study/functions.py index 1db09b8ad..9d726d286 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -1,17 +1,24 @@ # License: BSD 3-Clause +# ruff: noqa: PLR0913 +from __future__ import annotations -from typing import cast, Dict, List, Optional, Union import warnings +from typing import TYPE_CHECKING, Any, overload +from typing_extensions import Literal -import xmltodict import pandas as pd +import xmltodict -from openml.study import OpenMLStudy, 
OpenMLBenchmarkSuite -from openml.study.study import BaseStudy import openml._api_calls +import openml.config +import openml.utils +from openml.study.study import OpenMLBenchmarkSuite, OpenMLStudy + +if TYPE_CHECKING: + from openml.study.study import BaseStudy -def get_suite(suite_id: Union[int, str]) -> OpenMLBenchmarkSuite: +def get_suite(suite_id: int | str) -> OpenMLBenchmarkSuite: """ Retrieves all relevant information of an OpenML benchmarking suite from the server. @@ -25,14 +32,16 @@ def get_suite(suite_id: Union[int, str]) -> OpenMLBenchmarkSuite: OpenMLSuite The OpenML suite object """ - suite = cast(OpenMLBenchmarkSuite, _get_study(suite_id, entity_type="task")) - return suite + study = _get_study(suite_id, entity_type="task") + assert isinstance(study, OpenMLBenchmarkSuite) + + return study def get_study( - study_id: Union[int, str], - arg_for_backwards_compat: Optional[str] = None, -) -> OpenMLStudy: # noqa F401 + study_id: int | str, + arg_for_backwards_compat: str | None = None, # noqa: ARG001 +) -> OpenMLStudy: # F401 """ Retrieves all relevant information of an OpenML study from the server. @@ -57,18 +66,20 @@ def get_study( "It looks like you are running code from the OpenML100 paper. It still works, but lots " "of things have changed since then. Please use `get_suite('OpenML100')` instead." 
) - warnings.warn(message, DeprecationWarning) + warnings.warn(message, DeprecationWarning, stacklevel=2) openml.config.logger.warning(message) study = _get_study(study_id, entity_type="task") - return cast(OpenMLBenchmarkSuite, study) # type: ignore - else: - study = cast(OpenMLStudy, _get_study(study_id, entity_type="run")) - return study + assert isinstance(study, OpenMLBenchmarkSuite) + return study # type: ignore -def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: - call_suffix = "study/{}".format(str(id_)) - xml_string = openml._api_calls._perform_api_call(call_suffix, "get") + study = _get_study(study_id, entity_type="run") + assert isinstance(study, OpenMLStudy) + return study + + +def _get_study(id_: int | str, entity_type: str) -> BaseStudy: + xml_string = openml._api_calls._perform_api_call(f"study/{id_}", "get") force_list_tags = ( "oml:data_id", "oml:flow_id", @@ -81,13 +92,13 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: study_id = int(result_dict["oml:id"]) alias = result_dict["oml:alias"] if "oml:alias" in result_dict else None main_entity_type = result_dict["oml:main_entity_type"] + if entity_type != main_entity_type: raise ValueError( - "Unexpected entity type '{}' reported by the server, expected '{}'".format( - main_entity_type, - entity_type, - ) + f"Unexpected entity type '{main_entity_type}' reported by the server" + f", expected '{entity_type}'" ) + benchmark_suite = ( result_dict["oml:benchmark_suite"] if "oml:benchmark_suite" in result_dict else None ) @@ -106,7 +117,21 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: current_tag["window_start"] = tag["oml:window_start"] tags.append(current_tag) - def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: + def get_nested_ids_from_result_dict(key: str, subkey: str) -> list[int] | None: + """Extracts a list of nested IDs from a result dictionary. + + Parameters + ---------- + key : str + Nested OpenML IDs. 
+ subkey : str + The subkey contains the nested OpenML IDs. + + Returns + ------- + Optional[List] + A list of nested OpenML IDs, or None if the key is not present in the dictionary. + """ if result_dict.get(key) is not None: return [int(oml_id) for oml_id in result_dict[key][subkey]] return None @@ -137,7 +162,6 @@ def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: ) # type: BaseStudy elif main_entity_type in ["tasks", "task"]: - tasks = cast("List[int]", tasks) study = OpenMLBenchmarkSuite( suite_id=study_id, alias=alias, @@ -152,7 +176,7 @@ def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: ) else: - raise ValueError("Unknown entity type {}".format(main_entity_type)) + raise ValueError(f"Unknown entity type {main_entity_type}") return study @@ -160,9 +184,9 @@ def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: def create_study( name: str, description: str, - run_ids: Optional[List[int]] = None, - alias: Optional[str] = None, - benchmark_suite: Optional[int] = None, + run_ids: list[int] | None = None, + alias: str | None = None, + benchmark_suite: int | None = None, ) -> OpenMLStudy: """ Creates an OpenML study (collection of data, tasks, flows, setups and run), @@ -211,8 +235,8 @@ def create_study( def create_benchmark_suite( name: str, description: str, - task_ids: List[int], - alias: Optional[str] = None, + task_ids: list[int], + alias: str | None = None, ) -> OpenMLBenchmarkSuite: """ Creates an OpenML benchmark suite (collection of entity types, where @@ -319,7 +343,7 @@ def delete_study(study_id: int) -> bool: return openml.utils._delete_entity("study", study_id) -def attach_to_suite(suite_id: int, task_ids: List[int]) -> int: +def attach_to_suite(suite_id: int, task_ids: list[int]) -> int: """Attaches a set of tasks to a benchmarking suite. 
Parameters @@ -338,7 +362,7 @@ def attach_to_suite(suite_id: int, task_ids: List[int]) -> int: return attach_to_study(suite_id, task_ids) -def attach_to_study(study_id: int, run_ids: List[int]) -> int: +def attach_to_study(study_id: int, run_ids: list[int]) -> int: """Attaches a set of runs to a study. Parameters @@ -354,18 +378,17 @@ def attach_to_study(study_id: int, run_ids: List[int]) -> int: int new size of the study (in terms of explicitly linked entities) """ - # Interestingly, there's no need to tell the server about the entity type, it knows by itself - uri = "study/%d/attach" % study_id - post_variables = {"ids": ",".join(str(x) for x in run_ids)} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call( - call=uri, request_method="post", data=post_variables + call=f"study/{study_id}/attach", + request_method="post", + data={"ids": ",".join(str(x) for x in run_ids)}, ) result = xmltodict.parse(result_xml)["oml:study_attach"] return int(result["oml:linked_entities"]) -def detach_from_suite(suite_id: int, task_ids: List[int]) -> int: +def detach_from_suite(suite_id: int, task_ids: list[int]) -> int: """Detaches a set of task ids from a suite. Parameters @@ -379,11 +402,12 @@ def detach_from_suite(suite_id: int, task_ids: List[int]) -> int: Returns ------- int - new size of the study (in terms of explicitly linked entities)""" + new size of the study (in terms of explicitly linked entities) + """ return detach_from_study(suite_id, task_ids) -def detach_from_study(study_id: int, run_ids: List[int]) -> int: +def detach_from_study(study_id: int, run_ids: list[int]) -> int: """Detaches a set of run ids from a study. 
Parameters @@ -399,24 +423,47 @@ def detach_from_study(study_id: int, run_ids: List[int]) -> int: int new size of the study (in terms of explicitly linked entities) """ - # Interestingly, there's no need to tell the server about the entity type, it knows by itself uri = "study/%d/detach" % study_id post_variables = {"ids": ",".join(str(x) for x in run_ids)} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call( - call=uri, request_method="post", data=post_variables + call=uri, + request_method="post", + data=post_variables, ) result = xmltodict.parse(result_xml)["oml:study_detach"] return int(result["oml:linked_entities"]) +@overload +def list_suites( + offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + uploader: list[int] | None = ..., + output_format: Literal["dict"] = "dict", +) -> dict: + ... + + +@overload +def list_suites( + offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + uploader: list[int] | None = ..., + output_format: Literal["dataframe"] = "dataframe", +) -> pd.DataFrame: + ... + + def list_suites( - offset: Optional[int] = None, - size: Optional[int] = None, - status: Optional[str] = None, - uploader: Optional[List[int]] = None, - output_format: str = "dict", -) -> Union[Dict, pd.DataFrame]: + offset: int | None = None, + size: int | None = None, + status: str | None = None, + uploader: list[int] | None = None, + output_format: Literal["dict", "dataframe"] = "dict", +) -> dict | pd.DataFrame: """ Return a list of all suites which are on OpenML. @@ -461,7 +508,7 @@ def list_suites( """ if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. 
" "Only 'dict' or 'dataframe' applicable.", ) # TODO: [0.15] if output_format == "dict": @@ -472,8 +519,8 @@ def list_suites( ) warnings.warn(msg, category=FutureWarning, stacklevel=2) - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_studies, offset=offset, size=size, @@ -483,14 +530,38 @@ def list_suites( ) +@overload def list_studies( - offset: Optional[int] = None, - size: Optional[int] = None, - status: Optional[str] = None, - uploader: Optional[List[str]] = None, - benchmark_suite: Optional[int] = None, - output_format: str = "dict", -) -> Union[Dict, pd.DataFrame]: + offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + uploader: list[str] | None = ..., + benchmark_suite: int | None = ..., + output_format: Literal["dict"] = "dict", +) -> dict: + ... + + +@overload +def list_studies( + offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + uploader: list[str] | None = ..., + benchmark_suite: int | None = ..., + output_format: Literal["dataframe"] = "dataframe", +) -> pd.DataFrame: + ... + + +def list_studies( + offset: int | None = None, + size: int | None = None, + status: str | None = None, + uploader: list[str] | None = None, + benchmark_suite: int | None = None, + output_format: Literal["dict", "dataframe"] = "dict", +) -> dict | pd.DataFrame: """ Return a list of all studies which are on OpenML. @@ -542,7 +613,7 @@ def list_studies( """ if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. 
" "Only 'dict' or 'dataframe' applicable.", ) # TODO: [0.15] if output_format == "dict": @@ -553,8 +624,8 @@ def list_studies( ) warnings.warn(msg, category=FutureWarning, stacklevel=2) - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_studies, offset=offset, size=size, @@ -565,7 +636,19 @@ def list_studies( ) -def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: +@overload +def _list_studies(output_format: Literal["dict"] = "dict", **kwargs: Any) -> dict: + ... + + +@overload +def _list_studies(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: + ... + + +def _list_studies( + output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any +) -> dict | pd.DataFrame: """ Perform api call to return a list of studies. @@ -586,23 +669,52 @@ def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: api_call = "study/list" if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" return __list_studies(api_call=api_call, output_format=output_format) -def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame]: +@overload +def __list_studies(api_call: str, output_format: Literal["dict"] = "dict") -> dict: + ... + + +@overload +def __list_studies(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: + ... + + +def __list_studies( + api_call: str, output_format: Literal["dict", "dataframe"] = "dict" +) -> dict | pd.DataFrame: + """Retrieves the list of OpenML studies and + returns it in a dictionary or a Pandas DataFrame. + + Parameters + ---------- + api_call : str + The API call for retrieving the list of OpenML studies. 
+ output_format : str in {"dict", "dataframe"} + Format of the output, either 'object' for a dictionary + or 'dataframe' for a Pandas DataFrame. + + Returns + ------- + Union[Dict, pd.DataFrame] + A dictionary or Pandas DataFrame of OpenML studies, + depending on the value of 'output_format'. + """ xml_string = openml._api_calls._perform_api_call(api_call, "get") study_dict = xmltodict.parse(xml_string, force_list=("oml:study",)) # Minimalistic check if the XML is useful - assert type(study_dict["oml:study_list"]["oml:study"]) == list, type( - study_dict["oml:study_list"] + assert isinstance(study_dict["oml:study_list"]["oml:study"], list), type( + study_dict["oml:study_list"], ) assert study_dict["oml:study_list"]["@xmlns:oml"] == "http://openml.org/openml", study_dict[ "oml:study_list" ]["@xmlns:oml"] - studies = dict() + studies = {} for study_ in study_dict["oml:study_list"]["oml:study"]: # maps from xml name to a tuple of (dict name, casting fn) expected_fields = { @@ -616,7 +728,7 @@ def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame "oml:creator": ("creator", int), } study_id = int(study_["oml:id"]) - current_study = dict() + current_study = {} for oml_field_name, (real_field_name, cast_fn) in expected_fields.items(): if oml_field_name in study_: current_study[real_field_name] = cast_fn(study_[oml_field_name]) diff --git a/openml/study/study.py b/openml/study/study.py index cfc4cab3b..83bbf0497 100644 --- a/openml/study/study.py +++ b/openml/study/study.py @@ -1,10 +1,11 @@ # License: BSD 3-Clause +# TODO(eddiebergman): Begging for dataclassses to shorten this all +from __future__ import annotations -from collections import OrderedDict -from typing import Dict, List, Optional, Tuple, Union, Any +from typing import Any, Sequence -import openml from openml.base import OpenMLBase +from openml.config import get_server_base_url class BaseStudy(OpenMLBase): @@ -55,23 +56,23 @@ class BaseStudy(OpenMLBase): a list of setup ids 
associated with this study """ - def __init__( + def __init__( # noqa: PLR0913 self, - study_id: Optional[int], - alias: Optional[str], + study_id: int | None, + alias: str | None, main_entity_type: str, - benchmark_suite: Optional[int], + benchmark_suite: int | None, name: str, description: str, - status: Optional[str], - creation_date: Optional[str], - creator: Optional[int], - tags: Optional[List[Dict]], - data: Optional[List[int]], - tasks: Optional[List[int]], - flows: Optional[List[int]], - runs: Optional[List[int]], - setups: Optional[List[int]], + status: str | None, + creation_date: str | None, + creator: int | None, + tags: list[dict] | None, + data: list[int] | None, + tasks: list[int] | None, + flows: list[int] | None, + runs: list[int] | None, + setups: list[int] | None, ): self.study_id = study_id self.alias = alias @@ -94,12 +95,13 @@ def _entity_letter(cls) -> str: return "s" @property - def id(self) -> Optional[int]: + def id(self) -> int | None: + """Return the id of the study.""" return self.study_id - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: """Collect all information to display in the __repr__ body.""" - fields: Dict[str, Any] = { + fields: dict[str, Any] = { "Name": self.name, "Status": self.status, "Main Entity Type": self.main_entity_type, @@ -108,7 +110,7 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: fields["ID"] = self.study_id fields["Study URL"] = self.openml_url if self.creator is not None: - fields["Creator"] = "{}/u/{}".format(openml.config.get_server_base_url(), self.creator) + fields["Creator"] = f"{get_server_base_url()}/u/{self.creator}" if self.creation_date is not None: fields["Upload Time"] = self.creation_date.replace("T", " ") if self.data is not None: @@ -136,42 +138,47 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: ] return [(key, 
fields[key]) for key in order if key in fields] - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.study_id = int(xml_response["oml:study_upload"]["oml:id"]) - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": + def _to_dict(self) -> dict[str, dict]: """Creates a dictionary representation of self.""" # some can not be uploaded, e.g., id, creator, creation_date simple_props = ["alias", "main_entity_type", "name", "description"] - # maps from attribute name (which is used as outer tag name) to immer - # tag name (e.g., self.tasks -> 1987 - # ) - complex_props = { - "tasks": "task_id", - "runs": "run_id", - } - - study_container = OrderedDict() # type: 'OrderedDict' - namespace_list = [("@xmlns:oml", "http://openml.org/openml")] - study_dict = OrderedDict(namespace_list) # type: 'OrderedDict' - study_container["oml:study"] = study_dict + # TODO(eddiebergman): Begging for a walrus if we can drop 3.7 + simple_prop_values = {} for prop_name in simple_props: content = getattr(self, prop_name, None) if content is not None: - study_dict["oml:" + prop_name] = content + simple_prop_values["oml:" + prop_name] = content + + # maps from attribute name (which is used as outer tag name) to immer + # tag name e.g., self.tasks -> 1987 + complex_props = {"tasks": "task_id", "runs": "run_id"} + + # TODO(eddiebergman): Begging for a walrus if we can drop 3.7 + complex_prop_values = {} for prop_name, inner_name in complex_props.items(): content = getattr(self, prop_name, None) if content is not None: - sub_dict = {"oml:" + inner_name: content} - study_dict["oml:" + prop_name] = sub_dict - return study_container + complex_prop_values["oml:" + prop_name] = {"oml:" + inner_name: content} + + return { + "oml:study": { + "@xmlns:oml": "http://openml.org/openml", + **simple_prop_values, + **complex_prop_values, + } + } - def push_tag(self, tag: str): 
+ def push_tag(self, tag: str) -> None: + """Add a tag to the study.""" raise NotImplementedError("Tags for studies is not (yet) supported.") - def remove_tag(self, tag: str): + def remove_tag(self, tag: str) -> None: + """Remove a tag from the study.""" raise NotImplementedError("Tags for studies is not (yet) supported.") @@ -219,22 +226,22 @@ class OpenMLStudy(BaseStudy): a list of setup ids associated with this study """ - def __init__( + def __init__( # noqa: PLR0913 self, - study_id: Optional[int], - alias: Optional[str], - benchmark_suite: Optional[int], + study_id: int | None, + alias: str | None, + benchmark_suite: int | None, name: str, description: str, - status: Optional[str], - creation_date: Optional[str], - creator: Optional[int], - tags: Optional[List[Dict]], - data: Optional[List[int]], - tasks: Optional[List[int]], - flows: Optional[List[int]], - runs: Optional[List[int]], - setups: Optional[List[int]], + status: str | None, + creation_date: str | None, + creator: int | None, + tags: list[dict] | None, + data: list[int] | None, + tasks: list[int] | None, + flows: list[int] | None, + runs: list[int] | None, + setups: list[int] | None, ): super().__init__( study_id=study_id, @@ -293,18 +300,18 @@ class OpenMLBenchmarkSuite(BaseStudy): a list of task ids associated with this study """ - def __init__( + def __init__( # noqa: PLR0913 self, - suite_id: Optional[int], - alias: Optional[str], + suite_id: int | None, + alias: str | None, name: str, description: str, - status: Optional[str], - creation_date: Optional[str], - creator: Optional[int], - tags: Optional[List[Dict]], - data: Optional[List[int]], - tasks: List[int], + status: str | None, + creation_date: str | None, + creator: int | None, + tags: list[dict] | None, + data: list[int] | None, + tasks: list[int] | None, ): super().__init__( study_id=suite_id, diff --git a/openml/tasks/__init__.py b/openml/tasks/__init__.py index a5d578d2d..f6df3a8d4 100644 --- a/openml/tasks/__init__.py +++ 
b/openml/tasks/__init__.py @@ -1,21 +1,21 @@ # License: BSD 3-Clause -from .task import ( - OpenMLTask, - OpenMLSupervisedTask, - OpenMLClassificationTask, - OpenMLRegressionTask, - OpenMLClusteringTask, - OpenMLLearningCurveTask, - TaskType, -) -from .split import OpenMLSplit from .functions import ( create_task, + delete_task, get_task, get_tasks, list_tasks, - delete_task, +) +from .split import OpenMLSplit +from .task import ( + OpenMLClassificationTask, + OpenMLClusteringTask, + OpenMLLearningCurveTask, + OpenMLRegressionTask, + OpenMLSupervisedTask, + OpenMLTask, + TaskType, ) __all__ = [ diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index b038179fc..c763714bf 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -1,55 +1,50 @@ # License: BSD 3-Clause -import warnings -from collections import OrderedDict -import io -import re +from __future__ import annotations + import os -from typing import Union, Dict, Optional, List +import re +import warnings +from typing import Any +from typing_extensions import Literal import pandas as pd import xmltodict -from ..exceptions import OpenMLCacheException -from ..datasets import get_dataset +import openml._api_calls +import openml.utils +from openml.datasets import get_dataset +from openml.exceptions import OpenMLCacheException + from .task import ( OpenMLClassificationTask, OpenMLClusteringTask, OpenMLLearningCurveTask, - TaskType, OpenMLRegressionTask, OpenMLSupervisedTask, OpenMLTask, + TaskType, ) -import openml.utils -import openml._api_calls TASKS_CACHE_DIR_NAME = "tasks" -def _get_cached_tasks(): +def _get_cached_tasks() -> dict[int, OpenMLTask]: """Return a dict of all the tasks which are cached locally. + Returns ------- tasks : OrderedDict A dict of all the cached tasks. Each task is an instance of OpenMLTask. 
""" - tasks = OrderedDict() - task_cache_dir = openml.utils._create_cache_directory(TASKS_CACHE_DIR_NAME) directory_content = os.listdir(task_cache_dir) directory_content.sort() + # Find all dataset ids for which we have downloaded the dataset # description - - for filename in directory_content: - if not re.match(r"[0-9]*", filename): - continue - - tid = int(filename) - tasks[tid] = _get_cached_task(tid) - - return tasks + tids = (int(did) for did in directory_content if re.match(r"[0-9]*", did)) + return {tid: _get_cached_task(tid) for tid in tids} def _get_cached_task(tid: int) -> OpenMLTask: @@ -66,16 +61,18 @@ def _get_cached_task(tid: int) -> OpenMLTask: """ tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, tid) + task_xml_path = tid_cache_dir / "task.xml" try: - with io.open(os.path.join(tid_cache_dir, "task.xml"), encoding="utf8") as fh: + with task_xml_path.open(encoding="utf8") as fh: return _create_task_from_xml(fh.read()) - except (OSError, IOError): + except OSError as e: openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) - raise OpenMLCacheException("Task file for tid %d not " "cached" % tid) + raise OpenMLCacheException(f"Task file for tid {tid} not cached") from e -def _get_estimation_procedure_list(): +def _get_estimation_procedure_list() -> list[dict[str, Any]]: """Return a list of all estimation procedures which are on OpenML. + Returns ------- procedures : list @@ -90,50 +87,52 @@ def _get_estimation_procedure_list(): # Minimalistic check if the XML is useful if "oml:estimationprocedures" not in procs_dict: raise ValueError("Error in return XML, does not contain tag oml:estimationprocedures.") - elif "@xmlns:oml" not in procs_dict["oml:estimationprocedures"]: + + if "@xmlns:oml" not in procs_dict["oml:estimationprocedures"]: raise ValueError( "Error in return XML, does not contain tag " - "@xmlns:oml as a child of oml:estimationprocedures." 
+ "@xmlns:oml as a child of oml:estimationprocedures.", ) - elif procs_dict["oml:estimationprocedures"]["@xmlns:oml"] != "http://openml.org/openml": + + if procs_dict["oml:estimationprocedures"]["@xmlns:oml"] != "http://openml.org/openml": raise ValueError( "Error in return XML, value of " "oml:estimationprocedures/@xmlns:oml is not " "http://openml.org/openml, but %s" - % str(procs_dict["oml:estimationprocedures"]["@xmlns:oml"]) + % str(procs_dict["oml:estimationprocedures"]["@xmlns:oml"]), ) - procs = [] + procs: list[dict[str, Any]] = [] for proc_ in procs_dict["oml:estimationprocedures"]["oml:estimationprocedure"]: task_type_int = int(proc_["oml:ttid"]) try: task_type_id = TaskType(task_type_int) + procs.append( + { + "id": int(proc_["oml:id"]), + "task_type_id": task_type_id, + "name": proc_["oml:name"], + "type": proc_["oml:type"], + }, + ) except ValueError as e: warnings.warn( f"Could not create task type id for {task_type_int} due to error {e}", RuntimeWarning, + stacklevel=2, ) - continue - procs.append( - { - "id": int(proc_["oml:id"]), - "task_type_id": task_type_id, - "name": proc_["oml:name"], - "type": proc_["oml:type"], - } - ) return procs def list_tasks( - task_type: Optional[TaskType] = None, - offset: Optional[int] = None, - size: Optional[int] = None, - tag: Optional[str] = None, - output_format: str = "dict", - **kwargs, -) -> Union[Dict, pd.DataFrame]: + task_type: TaskType | None = None, + offset: int | None = None, + size: int | None = None, + tag: str | None = None, + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Return a number of tasks having the given tag and task_type @@ -174,7 +173,7 @@ def list_tasks( """ if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. 
" "Only 'dict' or 'dataframe' applicable.", ) # TODO: [0.15] if output_format == "dict": @@ -184,8 +183,8 @@ def list_tasks( "will continue to work, use `output_format`='dataframe'." ) warnings.warn(msg, category=FutureWarning, stacklevel=2) - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_tasks, task_type=task_type, offset=offset, @@ -195,9 +194,14 @@ def list_tasks( ) -def _list_tasks(task_type=None, output_format="dict", **kwargs): +def _list_tasks( + task_type: TaskType | None = None, + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Perform the api call to return a number of tasks having the given filters. + Parameters ---------- Filter task_type is separated from the other filters because @@ -224,33 +228,62 @@ def _list_tasks(task_type=None, output_format="dict", **kwargs): if kwargs is not None: for operator, value in kwargs.items(): if operator == "task_id": - value = ",".join([str(int(i)) for i in value]) - api_call += "/%s/%s" % (operator, value) + value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901 + api_call += f"/{operator}/{value}" + return __list_tasks(api_call=api_call, output_format=output_format) -def __list_tasks(api_call, output_format="dict"): +# TODO(eddiebergman): overload todefine type returned +def __list_tasks( # noqa: PLR0912, C901 + api_call: str, + output_format: Literal["dict", "dataframe"] = "dict", +) -> dict | pd.DataFrame: + """Returns a dictionary or a Pandas DataFrame with information about OpenML tasks. + + Parameters + ---------- + api_call : str + The API call specifying which tasks to return. + output_format : str in {"dict", "dataframe"} + Output format for the returned object. + + Returns + ------- + Union[Dict, pd.DataFrame] + A dictionary or a Pandas DataFrame with information about OpenML tasks. 
+ + Raises + ------ + ValueError + If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml', + or has an incorrect value for '@xmlns:oml'. + KeyError + If an invalid key is found in the XML for a task. + """ xml_string = openml._api_calls._perform_api_call(api_call, "get") tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input")) # Minimalistic check if the XML is useful if "oml:tasks" not in tasks_dict: - raise ValueError('Error in return XML, does not contain "oml:runs": %s' % str(tasks_dict)) - elif "@xmlns:oml" not in tasks_dict["oml:tasks"]: + raise ValueError(f'Error in return XML, does not contain "oml:runs": {tasks_dict}') + + if "@xmlns:oml" not in tasks_dict["oml:tasks"]: raise ValueError( - "Error in return XML, does not contain " '"oml:runs"/@xmlns:oml: %s' % str(tasks_dict) + f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {tasks_dict}' ) - elif tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml": + + if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml": raise ValueError( "Error in return XML, value of " '"oml:runs"/@xmlns:oml is not ' - '"http://openml.org/openml": %s' % str(tasks_dict) + '"http://openml.org/openml": %s' % str(tasks_dict), ) - assert type(tasks_dict["oml:tasks"]["oml:task"]) == list, type(tasks_dict["oml:tasks"]) + assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"]) - tasks = dict() + tasks = {} procs = _get_estimation_procedure_list() - proc_dict = dict((x["id"], x) for x in procs) + proc_dict = {x["id"]: x for x in procs} for task_ in tasks_dict["oml:tasks"]["oml:task"]: tid = None @@ -263,8 +296,10 @@ def __list_tasks(api_call, output_format="dict"): warnings.warn( f"Could not create task type id for {task_type_int} due to error {e}", RuntimeWarning, + stacklevel=2, ) continue + task = { "tid": tid, "ttid": task_type_id, @@ -275,15 +310,15 @@ def __list_tasks(api_call, output_format="dict"): } # 
Other task inputs - for input in task_.get("oml:input", list()): - if input["@name"] == "estimation_procedure": - task[input["@name"]] = proc_dict[int(input["#text"])]["name"] + for _input in task_.get("oml:input", []): + if _input["@name"] == "estimation_procedure": + task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"] else: - value = input.get("#text") - task[input["@name"]] = value + value = _input.get("#text") + task[_input["@name"]] = value # The number of qualities can range from 0 to infinity - for quality in task_.get("oml:quality", list()): + for quality in task_.get("oml:quality", []): if "#text" not in quality: quality_value = 0.0 else: @@ -295,10 +330,13 @@ def __list_tasks(api_call, output_format="dict"): tasks[tid] = task except KeyError as e: if tid is not None: - warnings.warn("Invalid xml for task %d: %s\nFrom %s" % (tid, e, task_)) + warnings.warn( + "Invalid xml for task %d: %s\nFrom %s" % (tid, e, task_), + RuntimeWarning, + stacklevel=2, + ) else: - warnings.warn("Could not find key %s in %s!" % (e, task_)) - continue + warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2) if output_format == "dataframe": tasks = pd.DataFrame.from_dict(tasks, orient="index") @@ -306,9 +344,12 @@ def __list_tasks(api_call, output_format="dict"): return tasks +# TODO(eddiebergman): Maybe since this isn't public api, we can make it keyword only? def get_tasks( - task_ids: List[int], download_data: bool = True, download_qualities: bool = True -) -> List[OpenMLTask]: + task_ids: list[int], + download_data: bool = True, # noqa: FBT001, FBT002 + download_qualities: bool = True, # noqa: FBT001, FBT002 +) -> list[OpenMLTask]: """Download tasks. This function iterates :meth:`openml.tasks.get_task`. 
@@ -334,7 +375,10 @@ def get_tasks( @openml.utils.thread_safe_if_oslo_installed def get_task( - task_id: int, *dataset_args, download_splits: Optional[bool] = None, **get_dataset_kwargs + task_id: int, + *dataset_args: Any, + download_splits: bool | None = None, + **get_dataset_kwargs: Any, ) -> OpenMLTask: """Download OpenML task for a given task ID. @@ -374,6 +418,7 @@ def get_task( "of ``True`` and be independent from `download_data`. To disable this message until " "version 0.15 explicitly set `download_splits` to a bool.", FutureWarning, + stacklevel=3, ) download_splits = get_dataset_kwargs.get("download_data", True) @@ -382,17 +427,15 @@ def get_task( warnings.warn( "Task id must be specified as `int` from 0.14.0 onwards.", FutureWarning, + stacklevel=3, ) try: task_id = int(task_id) - except (ValueError, TypeError): - raise ValueError("Dataset ID is neither an Integer nor can be cast to an Integer.") + except (ValueError, TypeError) as e: + raise ValueError("Dataset ID is neither an Integer nor can be cast to an Integer.") from e - tid_cache_dir = openml.utils._create_cache_directory_for_id( - TASKS_CACHE_DIR_NAME, - task_id, - ) + tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) try: task = _get_task_description(task_id) @@ -404,38 +447,29 @@ def get_task( task.class_labels = dataset.retrieve_class_labels(task.target_name) # Clustering tasks do not have class labels # and do not offer download_split - if download_splits: - if isinstance(task, OpenMLSupervisedTask): - task.download_split() + if download_splits and isinstance(task, OpenMLSupervisedTask): + task.download_split() except Exception as e: - openml.utils._remove_cache_dir_for_id( - TASKS_CACHE_DIR_NAME, - tid_cache_dir, - ) + openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) raise e return task -def _get_task_description(task_id): +def _get_task_description(task_id: int) -> OpenMLTask: try: return _get_cached_task(task_id) except 
OpenMLCacheException: - xml_file = os.path.join( - openml.utils._create_cache_directory_for_id( - TASKS_CACHE_DIR_NAME, - task_id, - ), - "task.xml", - ) + _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) + xml_file = _cache_dir / "task.xml" task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get") - with io.open(xml_file, "w", encoding="utf8") as fh: + with xml_file.open("w", encoding="utf8") as fh: fh.write(task_xml) return _create_task_from_xml(task_xml) -def _create_task_from_xml(xml): +def _create_task_from_xml(xml: str) -> OpenMLTask: """Create a task given a xml string. Parameters @@ -448,8 +482,8 @@ def _create_task_from_xml(xml): OpenMLTask """ dic = xmltodict.parse(xml)["oml:task"] - estimation_parameters = dict() - inputs = dict() + estimation_parameters = {} + inputs = {} # Due to the unordered structure we obtain, we first have to extract # the possible keys of oml:input; dic["oml:input"] is a list of # OrderedDicts @@ -508,22 +542,20 @@ def _create_task_from_xml(xml): }.get(task_type) if cls is None: raise NotImplementedError("Task type %s not supported." % common_kwargs["task_type"]) - return cls(**common_kwargs) + return cls(**common_kwargs) # type: ignore +# TODO(eddiebergman): overload on `task_type` def create_task( task_type: TaskType, dataset_id: int, estimation_procedure_id: int, - target_name: Optional[str] = None, - evaluation_measure: Optional[str] = None, - **kwargs, -) -> Union[ - OpenMLClassificationTask, - OpenMLRegressionTask, - OpenMLLearningCurveTask, - OpenMLClusteringTask, -]: + target_name: str | None = None, + evaluation_measure: str | None = None, + **kwargs: Any, +) -> ( + OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask +): """Create a task based on different given attributes. 
Builds a task object with the function arguments as @@ -556,25 +588,26 @@ def create_task( OpenMLClassificationTask, OpenMLRegressionTask, OpenMLLearningCurveTask, OpenMLClusteringTask """ - task_cls = { - TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask, - TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask, - TaskType.CLUSTERING: OpenMLClusteringTask, - TaskType.LEARNING_CURVE: OpenMLLearningCurveTask, - }.get(task_type) - - if task_cls is None: - raise NotImplementedError("Task type {0:d} not supported.".format(task_type)) + if task_type == TaskType.CLUSTERING: + task_cls = OpenMLClusteringTask + elif task_type == TaskType.LEARNING_CURVE: + task_cls = OpenMLLearningCurveTask # type: ignore + elif task_type == TaskType.SUPERVISED_CLASSIFICATION: + task_cls = OpenMLClassificationTask # type: ignore + elif task_type == TaskType.SUPERVISED_REGRESSION: + task_cls = OpenMLRegressionTask # type: ignore else: - return task_cls( - task_type_id=task_type, - task_type=None, - data_set_id=dataset_id, - target_name=target_name, - estimation_procedure_id=estimation_procedure_id, - evaluation_measure=evaluation_measure, - **kwargs, - ) + raise NotImplementedError(f"Task type {task_type:d} not supported.") + + return task_cls( + task_type_id=task_type, + task_type="None", # TODO: refactor to get task type string from ID. 
+ data_set_id=dataset_id, + target_name=target_name, + estimation_procedure_id=estimation_procedure_id, + evaluation_measure=evaluation_measure, + **kwargs, + ) def delete_task(task_id: int) -> bool: diff --git a/openml/tasks/split.py b/openml/tasks/split.py index bc0dac55d..81105f1fd 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -1,17 +1,24 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import namedtuple, OrderedDict -import os import pickle +from collections import OrderedDict +from pathlib import Path +from typing import Any +from typing_extensions import NamedTuple +import arff # type: ignore import numpy as np -import arff -Split = namedtuple("Split", ["train", "test"]) +class Split(NamedTuple): + """A single split of a dataset.""" + train: np.ndarray + test: np.ndarray -class OpenMLSplit(object): + +class OpenMLSplit: """OpenML Split object. Parameters @@ -21,29 +28,37 @@ class OpenMLSplit(object): split : dict """ - def __init__(self, name, description, split): + def __init__( + self, + name: int | str, + description: str, + split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]], + ): self.description = description self.name = name - self.split = dict() + self.split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]] = {} # Add splits according to repetition for repetition in split: - repetition = int(repetition) - self.split[repetition] = OrderedDict() - for fold in split[repetition]: - self.split[repetition][fold] = OrderedDict() - for sample in split[repetition][fold]: - self.split[repetition][fold][sample] = split[repetition][fold][sample] + _rep = int(repetition) + self.split[_rep] = OrderedDict() + for fold in split[_rep]: + self.split[_rep][fold] = OrderedDict() + for sample in split[_rep][fold]: + self.split[_rep][fold][sample] = split[_rep][fold][sample] self.repeats = len(self.split) - if any([len(self.split[0]) != len(self.split[i]) for i in range(self.repeats)]): + + 
# TODO(eddiebergman): Better error message + if any(len(self.split[0]) != len(self.split[i]) for i in range(self.repeats)): raise ValueError("") + self.folds = len(self.split[0]) self.samples = len(self.split[0][0]) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if ( - type(self) != type(other) + (not isinstance(self, type(other))) or self.name != other.name or self.description != other.description or self.split.keys() != other.split.keys() @@ -69,23 +84,26 @@ def __eq__(self, other): return True @classmethod - def _from_arff_file(cls, filename: str) -> "OpenMLSplit": + def _from_arff_file(cls, filename: Path) -> OpenMLSplit: # noqa: C901, PLR0912 repetitions = None + name = None - pkl_filename = filename.replace(".arff", ".pkl.py3") + pkl_filename = filename.with_suffix(".pkl.py3") - if os.path.exists(pkl_filename): - with open(pkl_filename, "rb") as fh: - _ = pickle.load(fh) - repetitions = _["repetitions"] - name = _["name"] + if pkl_filename.exists(): + with pkl_filename.open("rb") as fh: + # TODO(eddiebergman): Would be good to figure out what _split is and assert it is + _split = pickle.load(fh) # noqa: S301 + repetitions = _split["repetitions"] + name = _split["name"] # Cache miss if repetitions is None: # Faster than liac-arff and sufficient in this situation! - if not os.path.exists(filename): - raise FileNotFoundError("Split arff %s does not exist!" 
% filename) - file_data = arff.load(open(filename), return_type=arff.DENSE_GEN) + if not filename.exists(): + raise FileNotFoundError(f"Split arff {filename} does not exist!") + + file_data = arff.load(filename.open("r"), return_type=arff.DENSE_GEN) splits = file_data["data"] name = file_data["relation"] attrnames = [attr[0] for attr in file_data["attributes"]] @@ -130,15 +148,34 @@ def _from_arff_file(cls, filename: str) -> "OpenMLSplit": np.array(repetitions[repetition][fold][sample][1], dtype=np.int32), ) - with open(pkl_filename, "wb") as fh: + with pkl_filename.open("wb") as fh: pickle.dump({"name": name, "repetitions": repetitions}, fh, protocol=2) + assert name is not None return cls(name, "", repetitions) - def from_dataset(self, X, Y, folds, repeats): - raise NotImplementedError() - - def get(self, repeat=0, fold=0, sample=0): + def get(self, repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[np.ndarray, np.ndarray]: + """Returns the specified data split from the CrossValidationSplit object. + + Parameters + ---------- + repeat : int + Index of the repeat to retrieve. + fold : int + Index of the fold to retrieve. + sample : int + Index of the sample to retrieve. + + Returns + ------- + numpy.ndarray + The data split for the specified repeat, fold, and sample. + + Raises + ------ + ValueError + If the specified repeat, fold, or sample is not known. + """ if repeat not in self.split: raise ValueError("Repeat %s not known" % str(repeat)) if fold not in self.split[repeat]: diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 36e0ada1c..4ad4cec62 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -1,25 +1,34 @@ # License: BSD 3-Clause +# TODO(eddiebergman): Seems like a lot of the subclasses could just get away with setting +# a `ClassVar` for whatever changes as their `__init__` defaults, less duplicated code. 
+from __future__ import annotations + import warnings from abc import ABC -from collections import OrderedDict from enum import Enum -import io -import os -from typing import Union, Tuple, Dict, List, Optional, Any -from warnings import warn - -import numpy as np -import pandas as pd -import scipy.sparse +from pathlib import Path +from typing import TYPE_CHECKING, Any, Sequence +from typing_extensions import Literal, TypedDict, overload import openml._api_calls +import openml.config +from openml import datasets from openml.base import OpenMLBase -from .. import datasets +from openml.utils import _create_cache_directory_for_id + from .split import OpenMLSplit -from ..utils import _create_cache_directory_for_id + +if TYPE_CHECKING: + import numpy as np + import pandas as pd + import scipy.sparse +# TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used +# and stored on server. class TaskType(Enum): + """Possible task types as defined in OpenML.""" + SUPERVISED_CLASSIFICATION = 1 SUPERVISED_REGRESSION = 2 LEARNING_CURVE = 3 @@ -31,61 +40,76 @@ class TaskType(Enum): MULTITASK_REGRESSION = 9 +class _EstimationProcedure(TypedDict): + type: str | None + parameters: dict[str, str] | None + data_splits_url: str | None + + class OpenMLTask(OpenMLBase): """OpenML Task object. Parameters ---------- - task_type_id : TaskType - Refers to the type of task. - task_type : str - Refers to the task. + task_id: Union[int, None] + Refers to the unique identifier of OpenML task. + task_type_id: TaskType + Refers to the type of OpenML task. + task_type: str + Refers to the OpenML task. data_set_id: int Refers to the data. estimation_procedure_id: int Refers to the type of estimates used. + estimation_procedure_type: str, default=None + Refers to the type of estimation procedure used for the OpenML task. + estimation_parameters: [Dict[str, str]], default=None + Estimation parameters used for the OpenML task. 
+ evaluation_measure: str, default=None + Refers to the evaluation measure. + data_splits_url: str, default=None + Refers to the URL of the data splits used for the OpenML task. """ - def __init__( + def __init__( # noqa: PLR0913 self, - task_id: Optional[int], + task_id: int | None, task_type_id: TaskType, task_type: str, data_set_id: int, estimation_procedure_id: int = 1, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - evaluation_measure: Optional[str] = None, - data_splits_url: Optional[str] = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + evaluation_measure: str | None = None, + data_splits_url: str | None = None, ): self.task_id = int(task_id) if task_id is not None else None self.task_type_id = task_type_id self.task_type = task_type self.dataset_id = int(data_set_id) self.evaluation_measure = evaluation_measure - self.estimation_procedure = ( - dict() - ) # type: Dict[str, Optional[Union[str, Dict]]] # noqa E501 - self.estimation_procedure["type"] = estimation_procedure_type - self.estimation_procedure["parameters"] = estimation_parameters - self.estimation_procedure["data_splits_url"] = data_splits_url + self.estimation_procedure: _EstimationProcedure = { + "type": estimation_procedure_type, + "parameters": estimation_parameters, + "data_splits_url": data_splits_url, + } self.estimation_procedure_id = estimation_procedure_id - self.split = None # type: Optional[OpenMLSplit] + self.split: OpenMLSplit | None = None @classmethod def _entity_letter(cls) -> str: return "t" @property - def id(self) -> Optional[int]: + def id(self) -> int | None: + """Return the OpenML ID of this task.""" return self.task_id - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: """Collect all information to display in the __repr__ body.""" - 
fields: Dict[str, Any] = { - "Task Type Description": "{}/tt/{}".format( - openml.config.get_server_base_url(), self.task_type_id - ) + base_server_url = openml.config.get_server_base_url() + fields: dict[str, Any] = { + "Task Type Description": f"{base_server_url}/tt/{self.task_type_id}" } if self.task_id is not None: fields["Task ID"] = self.task_id @@ -94,10 +118,17 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: fields["Evaluation Measure"] = self.evaluation_measure if self.estimation_procedure is not None: fields["Estimation Procedure"] = self.estimation_procedure["type"] - if getattr(self, "target_name", None) is not None: - fields["Target Feature"] = getattr(self, "target_name") - if hasattr(self, "class_labels") and getattr(self, "class_labels") is not None: - fields["# of Classes"] = len(getattr(self, "class_labels")) + + # TODO(eddiebergman): Subclasses could advertise/provide this, instead of having to + # have the base class know about it's subclasses. 
+ target_name = getattr(self, "target_name", None) + if target_name is not None: + fields["Target Feature"] = target_name + + class_labels = getattr(self, "class_labels", None) + if class_labels is not None: + fields["# of Classes"] = len(class_labels) + if hasattr(self, "cost_matrix"): fields["Cost Matrix"] = "Available" @@ -115,7 +146,7 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: return [(key, fields[key]) for key in order if key in fields] def get_dataset(self) -> datasets.OpenMLDataset: - """Download dataset associated with task""" + """Download dataset associated with task.""" return datasets.get_dataset(self.dataset_id) def get_train_test_split_indices( @@ -123,80 +154,68 @@ def get_train_test_split_indices( fold: int = 0, repeat: int = 0, sample: int = 0, - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray]: + """Get the indices of the train and test splits for a given task.""" # Replace with retrieve from cache if self.split is None: self.split = self.download_split() - train_indices, test_indices = self.split.get( - repeat=repeat, - fold=fold, - sample=sample, - ) - return train_indices, test_indices + return self.split.get(repeat=repeat, fold=fold, sample=sample) - def _download_split(self, cache_file: str): + def _download_split(self, cache_file: Path) -> None: + # TODO(eddiebergman): Not sure about this try to read and error approach try: - with io.open(cache_file, encoding="utf8"): + with cache_file.open(encoding="utf8"): pass - except (OSError, IOError): + except OSError: split_url = self.estimation_procedure["data_splits_url"] openml._api_calls._download_text_file( source=str(split_url), - output_path=cache_file, + output_path=str(cache_file), ) def download_split(self) -> OpenMLSplit: """Download the OpenML split for a given task.""" - cached_split_file = os.path.join( - _create_cache_directory_for_id("tasks", self.task_id), - "datasplits.arff", - ) + # TODO(eddiebergman): Can this every 
be `None`? + assert self.task_id is not None + cache_dir = _create_cache_directory_for_id("tasks", self.task_id) + cached_split_file = cache_dir / "datasplits.arff" try: split = OpenMLSplit._from_arff_file(cached_split_file) - except (OSError, IOError): + except OSError: # Next, download and cache the associated split file self._download_split(cached_split_file) split = OpenMLSplit._from_arff_file(cached_split_file) return split - def get_split_dimensions(self) -> Tuple[int, int, int]: + def get_split_dimensions(self) -> tuple[int, int, int]: + """Get the (repeats, folds, samples) of the split for a given task.""" if self.split is None: self.split = self.download_split() return self.split.repeats, self.split.folds, self.split.samples - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - """Creates a dictionary representation of self.""" - task_container = OrderedDict() # type: OrderedDict[str, OrderedDict] - task_dict = OrderedDict( - [("@xmlns:oml", "http://openml.org/openml")] - ) # type: OrderedDict[str, Union[List, str, int]] - - task_container["oml:task_inputs"] = task_dict - task_dict["oml:task_type_id"] = self.task_type_id.value - - # having task_inputs and adding a type annotation - # solves wrong warnings - task_inputs = [ - OrderedDict([("@name", "source_data"), ("#text", str(self.dataset_id))]), - OrderedDict( - [("@name", "estimation_procedure"), ("#text", str(self.estimation_procedure_id))] - ), - ] # type: List[OrderedDict] - - if self.evaluation_measure is not None: - task_inputs.append( - OrderedDict([("@name", "evaluation_measures"), ("#text", self.evaluation_measure)]) - ) - - task_dict["oml:input"] = task_inputs - - return task_container + # TODO(eddiebergman): Really need some better typing on all this + def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: + """Creates a dictionary representation of self in a string format (for XML parsing).""" + oml_input = [ + {"@name": "source_data", "#text": 
str(self.dataset_id)}, + {"@name": "estimation_procedure", "#text": str(self.estimation_procedure_id)}, + ] + if self.evaluation_measure is not None: # + oml_input.append({"@name": "evaluation_measures", "#text": self.evaluation_measure}) + + return { + "oml:task_inputs": { + "@xmlns:oml": "http://openml.org/openml", + "oml:task_type_id": self.task_type_id.value, # This is an int from the enum? + "oml:input": oml_input, + } + } - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.task_id = int(xml_response["oml:upload_task"]["oml:id"]) @@ -206,24 +225,42 @@ class OpenMLSupervisedTask(OpenMLTask, ABC): Parameters ---------- + task_type_id : TaskType + ID of the task type. + task_type : str + Name of the task type. + data_set_id : int + ID of the OpenML dataset associated with the task. target_name : str Name of the target feature (the class variable). + estimation_procedure_id : int, default=None + ID of the estimation procedure for the task. + estimation_procedure_type : str, default=None + Type of the estimation procedure for the task. + estimation_parameters : dict, default=None + Estimation parameters for the task. + evaluation_measure : str, default=None + Name of the evaluation measure for the task. + data_splits_url : str, default=None + URL of the data splits for the task. + task_id: Union[int, None] + Refers to the unique identifier of task. 
""" - def __init__( + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 1, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - evaluation_measure: Optional[str] = None, - data_splits_url: Optional[str] = None, - task_id: Optional[int] = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + evaluation_measure: str | None = None, + data_splits_url: str | None = None, + task_id: int | None = None, ): - super(OpenMLSupervisedTask, self).__init__( + super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, @@ -237,11 +274,30 @@ def __init__( self.target_name = target_name + @overload def get_X_and_y( - self, - dataset_format: str = "array", - ) -> Tuple[ - Union[np.ndarray, pd.DataFrame, scipy.sparse.spmatrix], Union[np.ndarray, pd.Series] + self, dataset_format: Literal["array"] = "array" + ) -> tuple[ + np.ndarray | scipy.sparse.spmatrix, + np.ndarray | None, + ]: + ... + + @overload + def get_X_and_y( + self, dataset_format: Literal["dataframe"] + ) -> tuple[ + pd.DataFrame, + pd.Series | pd.DataFrame | None, + ]: + ... + + # TODO(eddiebergman): Do all OpenMLSupervisedTask have a `y`? + def get_X_and_y( + self, dataset_format: Literal["dataframe", "array"] = "array" + ) -> tuple[ + np.ndarray | pd.DataFrame | scipy.sparse.spmatrix, + np.ndarray | pd.Series | pd.DataFrame | None, ]: """Get data associated with the current task. 
@@ -273,34 +329,35 @@ def get_X_and_y( TaskType.LEARNING_CURVE, ): raise NotImplementedError(self.task_type) + X, y, _, _ = dataset.get_data( dataset_format=dataset_format, target=self.target_name, ) return X, y - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - task_container = super(OpenMLSupervisedTask, self)._to_dict() - task_dict = task_container["oml:task_inputs"] - - task_dict["oml:input"].append( - OrderedDict([("@name", "target_feature"), ("#text", self.target_name)]) - ) + def _to_dict(self) -> dict[str, dict]: + task_container = super()._to_dict() + oml_input = task_container["oml:task_inputs"]["oml:input"] # type: ignore + assert isinstance(oml_input, list) + oml_input.append({"@name": "target_feature", "#text": self.target_name}) return task_container @property - def estimation_parameters(self): - warn( + def estimation_parameters(self) -> dict[str, str] | None: + """Return the estimation parameters for the task.""" + warnings.warn( "The estimation_parameters attribute will be " "deprecated in the future, please use " "estimation_procedure['parameters'] instead", PendingDeprecationWarning, + stacklevel=2, ) return self.estimation_procedure["parameters"] @estimation_parameters.setter - def estimation_parameters(self, est_parameters): + def estimation_parameters(self, est_parameters: dict[str, str] | None) -> None: self.estimation_procedure["parameters"] = est_parameters @@ -309,26 +366,48 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): Parameters ---------- - class_labels : List of str (optional) - cost_matrix: array (optional) + task_type_id : TaskType + ID of the Classification task type. + task_type : str + Name of the Classification task type. + data_set_id : int + ID of the OpenML dataset associated with the Classification task. + target_name : str + Name of the target variable. + estimation_procedure_id : int, default=None + ID of the estimation procedure for the Classification task. 
+ estimation_procedure_type : str, default=None + Type of the estimation procedure. + estimation_parameters : dict, default=None + Estimation parameters for the Classification task. + evaluation_measure : str, default=None + Name of the evaluation measure. + data_splits_url : str, default=None + URL of the data splits for the Classification task. + task_id : Union[int, None] + ID of the Classification task (if it already exists on OpenML). + class_labels : List of str, default=None + A list of class labels (for classification tasks). + cost_matrix : array, default=None + A cost matrix (for classification tasks). """ - def __init__( + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 1, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - evaluation_measure: Optional[str] = None, - data_splits_url: Optional[str] = None, - task_id: Optional[int] = None, - class_labels: Optional[List[str]] = None, - cost_matrix: Optional[np.ndarray] = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + evaluation_measure: str | None = None, + data_splits_url: str | None = None, + task_id: int | None = None, + class_labels: list[str] | None = None, + cost_matrix: np.ndarray | None = None, ): - super(OpenMLClassificationTask, self).__init__( + super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, @@ -348,22 +427,46 @@ def __init__( class OpenMLRegressionTask(OpenMLSupervisedTask): - """OpenML Regression object.""" + """OpenML Regression object. + + Parameters + ---------- + task_type_id : TaskType + Task type ID of the OpenML Regression task. + task_type : str + Task type of the OpenML Regression task. + data_set_id : int + ID of the OpenML dataset. + target_name : str + Name of the target feature used in the Regression task. 
+ estimation_procedure_id : int, default=None + ID of the OpenML estimation procedure. + estimation_procedure_type : str, default=None + Type of the OpenML estimation procedure. + estimation_parameters : dict, default=None + Parameters used by the OpenML estimation procedure. + data_splits_url : str, default=None + URL of the OpenML data splits for the Regression task. + task_id : Union[int, None] + ID of the OpenML Regression task. + evaluation_measure : str, default=None + Evaluation measure used in the Regression task. + """ - def __init__( + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 7, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - data_splits_url: Optional[str] = None, - task_id: Optional[int] = None, - evaluation_measure: Optional[str] = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + data_splits_url: str | None = None, + task_id: int | None = None, + evaluation_measure: str | None = None, ): - super(OpenMLRegressionTask, self).__init__( + super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, @@ -382,25 +485,43 @@ class OpenMLClusteringTask(OpenMLTask): Parameters ---------- - target_name : str (optional) + task_type_id : TaskType + Task type ID of the OpenML clustering task. + task_type : str + Task type of the OpenML clustering task. + data_set_id : int + ID of the OpenML dataset used in clustering the task. + estimation_procedure_id : int, default=None + ID of the OpenML estimation procedure. + task_id : Union[int, None] + ID of the OpenML clustering task. + estimation_procedure_type : str, default=None + Type of the OpenML estimation procedure used in the clustering task. + estimation_parameters : dict, default=None + Parameters used by the OpenML estimation procedure. 
+ data_splits_url : str, default=None + URL of the OpenML data splits for the clustering task. + evaluation_measure : str, default=None + Evaluation measure used in the clustering task. + target_name : str, default=None Name of the target feature (class) that is not part of the feature set for the clustering task. """ - def __init__( + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, estimation_procedure_id: int = 17, - task_id: Optional[int] = None, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - data_splits_url: Optional[str] = None, - evaluation_measure: Optional[str] = None, - target_name: Optional[str] = None, + task_id: int | None = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + data_splits_url: str | None = None, + evaluation_measure: str | None = None, + target_name: str | None = None, ): - super(OpenMLClusteringTask, self).__init__( + super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, @@ -414,10 +535,21 @@ def __init__( self.target_name = target_name + @overload def get_X( self, - dataset_format: str = "array", - ) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.spmatrix]: + dataset_format: Literal["array"] = "array", + ) -> np.ndarray | scipy.sparse.spmatrix: + ... + + @overload + def get_X(self, dataset_format: Literal["dataframe"]) -> pd.DataFrame: + ... + + def get_X( + self, + dataset_format: Literal["array", "dataframe"] = "array", + ) -> np.ndarray | pd.DataFrame | scipy.sparse.spmatrix: """Get data associated with the current task. 
Parameters @@ -432,15 +564,10 @@ def get_X( """ dataset = self.get_dataset() - data, *_ = dataset.get_data( - dataset_format=dataset_format, - target=None, - ) + data, *_ = dataset.get_data(dataset_format=dataset_format, target=None) return data - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - task_container = super(OpenMLClusteringTask, self)._to_dict() - + def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: # Right now, it is not supported as a feature. # Uncomment if it is supported on the server # in the future. @@ -455,28 +582,56 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": ]) ) """ - return task_container + return super()._to_dict() class OpenMLLearningCurveTask(OpenMLClassificationTask): - """OpenML Learning Curve object.""" + """OpenML Learning Curve object. + + Parameters + ---------- + task_type_id : TaskType + ID of the Learning Curve task. + task_type : str + Name of the Learning Curve task. + data_set_id : int + ID of the dataset that this task is associated with. + target_name : str + Name of the target feature in the dataset. + estimation_procedure_id : int, default=None + ID of the estimation procedure to use for evaluating models. + estimation_procedure_type : str, default=None + Type of the estimation procedure. + estimation_parameters : dict, default=None + Additional parameters for the estimation procedure. + data_splits_url : str, default=None + URL of the file containing the data splits for Learning Curve task. + task_id : Union[int, None] + ID of the Learning Curve task. + evaluation_measure : str, default=None + Name of the evaluation measure to use for evaluating models. + class_labels : list of str, default=None + Class labels for Learning Curve tasks. + cost_matrix : numpy array, default=None + Cost matrix for Learning Curve tasks. 
+ """ - def __init__( + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 13, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - data_splits_url: Optional[str] = None, - task_id: Optional[int] = None, - evaluation_measure: Optional[str] = None, - class_labels: Optional[List[str]] = None, - cost_matrix: Optional[np.ndarray] = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + data_splits_url: str | None = None, + task_id: int | None = None, + evaluation_measure: str | None = None, + class_labels: list[str] | None = None, + cost_matrix: np.ndarray | None = None, ): - super(OpenMLLearningCurveTask, self).__init__( + super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, diff --git a/openml/testing.py b/openml/testing.py index ecb9620e1..4af361507 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -1,22 +1,32 @@ # License: BSD 3-Clause +from __future__ import annotations import hashlib import inspect +import logging import os import pathlib import shutil -import sys import time -from typing import Dict, Union, cast import unittest +from pathlib import Path +from typing import ClassVar + import pandas as pd import requests import openml -from openml.tasks import TaskType from openml.exceptions import OpenMLServerException +from openml.tasks import TaskType -import logging + +def _check_dataset(dataset: dict) -> None: + assert isinstance(dataset, dict) + assert len(dataset) >= 2 + assert "did" in dataset + assert isinstance(dataset["did"], int) + assert "status" in dataset + assert dataset["status"] in ["in_preparation", "active", "deactivated"] class TestBase(unittest.TestCase): @@ -28,14 +38,16 @@ class TestBase(unittest.TestCase): Hopefully soon allows using a test server, not the production server. 
""" - publish_tracker = { + # TODO: This could be made more explcit with a TypedDict instead of list[str | int] + publish_tracker: ClassVar[dict[str, list[str | int]]] = { "run": [], "data": [], "flow": [], "task": [], "study": [], "user": [], - } # type: dict + } + flow_name_tracker: ClassVar[list[str]] = [] test_server = "https://test.openml.org/api/v1/xml" # amueller's read/write key that he will throw away later apikey = "610344db6388d9ba34f6db45a3cf71de" @@ -44,7 +56,7 @@ class TestBase(unittest.TestCase): logger = logging.getLogger("unit_tests_published_entities") logger.setLevel(logging.DEBUG) - def setUp(self, n_levels: int = 1): + def setUp(self, n_levels: int = 1) -> None: """Setup variables and temporary directories. In particular, this methods: @@ -61,31 +73,30 @@ def setUp(self, n_levels: int = 1): Number of nested directories the test is in. Necessary to resolve the path to the ``files`` directory, which is located directly under the ``tests`` directory. """ - # This cache directory is checked in to git to simulate a populated # cache self.maxDiff = None - self.static_cache_dir = None - abspath_this_file = os.path.abspath(inspect.getfile(self.__class__)) - static_cache_dir = os.path.dirname(abspath_this_file) + abspath_this_file = Path(inspect.getfile(self.__class__)).absolute() + static_cache_dir = abspath_this_file.parent for _ in range(n_levels): - static_cache_dir = os.path.abspath(os.path.join(static_cache_dir, "..")) + static_cache_dir = static_cache_dir.parent.absolute() + content = os.listdir(static_cache_dir) if "files" in content: - self.static_cache_dir = os.path.join(static_cache_dir, "files") - - if self.static_cache_dir is None: + static_cache_dir = static_cache_dir / "files" + else: raise ValueError( - "Cannot find test cache dir, expected it to be {}!".format(static_cache_dir) + f"Cannot find test cache dir, expected it to be {static_cache_dir}!", ) - self.cwd = os.getcwd() - workdir = os.path.dirname(os.path.abspath(__file__)) + 
self.static_cache_dir = static_cache_dir + self.cwd = Path.cwd() + workdir = Path(__file__).parent.absolute() tmp_dir_name = self.id() - self.workdir = os.path.join(workdir, tmp_dir_name) + self.workdir = workdir / tmp_dir_name shutil.rmtree(self.workdir, ignore_errors=True) - os.mkdir(self.workdir) + self.workdir.mkdir(exist_ok=True) os.chdir(self.workdir) self.cached = True @@ -93,29 +104,34 @@ def setUp(self, n_levels: int = 1): self.production_server = "https://openml.org/api/v1/xml" openml.config.server = TestBase.test_server openml.config.avoid_duplicate_runs = False - openml.config.set_root_cache_directory(self.workdir) + openml.config.set_root_cache_directory(str(self.workdir)) # Increase the number of retries to avoid spurious server failures self.retry_policy = openml.config.retry_policy self.connection_n_retries = openml.config.connection_n_retries openml.config.set_retry_policy("robot", n_retries=20) - def tearDown(self): + def tearDown(self) -> None: + """Tear down the test""" os.chdir(self.cwd) try: shutil.rmtree(self.workdir) - except PermissionError: - if os.name == "nt": + except PermissionError as e: + if os.name != "nt": # one of the files may still be used by another process - pass - else: - raise + raise e + openml.config.server = self.production_server openml.config.connection_n_retries = self.connection_n_retries openml.config.retry_policy = self.retry_policy @classmethod - def _mark_entity_for_removal(self, entity_type, entity_id): + def _mark_entity_for_removal( + cls, + entity_type: str, + entity_id: int, + entity_name: str | None = None, + ) -> None: """Static record of entities uploaded to test server Dictionary of lists where the keys are 'entity_type'. 
@@ -127,9 +143,12 @@ def _mark_entity_for_removal(self, entity_type, entity_id): TestBase.publish_tracker[entity_type] = [entity_id] else: TestBase.publish_tracker[entity_type].append(entity_id) + if isinstance(entity_type, openml.flows.OpenMLFlow): + assert entity_name is not None + cls.flow_name_tracker.append(entity_name) @classmethod - def _delete_entity_from_tracker(self, entity_type, entity): + def _delete_entity_from_tracker(cls, entity_type: str, entity: int) -> None: """Deletes entity records from the static file_tracker Given an entity type and corresponding ID, deletes all entries, including @@ -139,61 +158,69 @@ def _delete_entity_from_tracker(self, entity_type, entity): # removes duplicate entries TestBase.publish_tracker[entity_type] = list(set(TestBase.publish_tracker[entity_type])) if entity_type == "flow": - delete_index = [ + delete_index = next( i - for i, (id_, _) in enumerate(TestBase.publish_tracker[entity_type]) + for i, (id_, _) in enumerate( + zip(TestBase.publish_tracker[entity_type], TestBase.flow_name_tracker), + ) if id_ == entity - ][0] + ) else: - delete_index = [ + delete_index = next( i for i, id_ in enumerate(TestBase.publish_tracker[entity_type]) if id_ == entity - ][0] + ) TestBase.publish_tracker[entity_type].pop(delete_index) - def _get_sentinel(self, sentinel=None): + def _get_sentinel(self, sentinel: str | None = None) -> str: if sentinel is None: # Create a unique prefix for the flow. Necessary because the flow # is identified by its name and external version online. Having a # unique name allows us to publish the same flow in each test run. 
- md5 = hashlib.md5() + md5 = hashlib.md5() # noqa: S324 md5.update(str(time.time()).encode("utf-8")) md5.update(str(os.getpid()).encode("utf-8")) sentinel = md5.hexdigest()[:10] sentinel = "TEST%s" % sentinel return sentinel - def _add_sentinel_to_flow_name(self, flow, sentinel=None): + def _add_sentinel_to_flow_name( + self, + flow: openml.flows.OpenMLFlow, + sentinel: str | None = None, + ) -> tuple[openml.flows.OpenMLFlow, str]: sentinel = self._get_sentinel(sentinel=sentinel) - flows_to_visit = list() + flows_to_visit = [] flows_to_visit.append(flow) while len(flows_to_visit) > 0: current_flow = flows_to_visit.pop() - current_flow.name = "%s%s" % (sentinel, current_flow.name) + current_flow.name = f"{sentinel}{current_flow.name}" for subflow in current_flow.components.values(): flows_to_visit.append(subflow) return flow, sentinel - def _check_dataset(self, dataset): - self.assertEqual(type(dataset), dict) - self.assertGreaterEqual(len(dataset), 2) - self.assertIn("did", dataset) - self.assertIsInstance(dataset["did"], int) - self.assertIn("status", dataset) - self.assertIsInstance(dataset["status"], str) - self.assertIn(dataset["status"], ["in_preparation", "active", "deactivated"]) - - def _check_fold_timing_evaluations( + def _check_dataset(self, dataset: dict[str, str | int]) -> None: + _check_dataset(dataset) + assert isinstance(dataset, dict) + assert len(dataset) >= 2 + assert "did" in dataset + assert isinstance(dataset["did"], int) + assert "status" in dataset + assert isinstance(dataset["status"], str) + assert dataset["status"] in ["in_preparation", "active", "deactivated"] + + def _check_fold_timing_evaluations( # noqa: PLR0913 self, - fold_evaluations: Dict, + fold_evaluations: dict[str, dict[int, dict[int, float]]], num_repeats: int, num_folds: int, + *, max_time_allowed: float = 60000.0, task_type: TaskType = TaskType.SUPERVISED_CLASSIFICATION, check_scores: bool = True, - ): + ) -> None: """ Checks whether the right timing measures are attached 
to the run (before upload). Test is only performed for versions >= Python3.3 @@ -203,7 +230,6 @@ def _check_fold_timing_evaluations( default max_time_allowed (per fold, in milli seconds) = 1 minute, quite pessimistic """ - # a dict mapping from openml measure to a tuple with the minimum and # maximum allowed value check_measures = { @@ -222,31 +248,31 @@ def _check_fold_timing_evaluations( elif task_type == TaskType.SUPERVISED_REGRESSION: check_measures["mean_absolute_error"] = (0, float("inf")) - self.assertIsInstance(fold_evaluations, dict) - if sys.version_info[:2] >= (3, 3): - # this only holds if we are allowed to record time (otherwise some - # are missing) - self.assertEqual(set(fold_evaluations.keys()), set(check_measures.keys())) + assert isinstance(fold_evaluations, dict) + assert set(fold_evaluations.keys()) == set(check_measures.keys()) - for measure in check_measures.keys(): + for measure in check_measures: if measure in fold_evaluations: num_rep_entrees = len(fold_evaluations[measure]) - self.assertEqual(num_rep_entrees, num_repeats) + assert num_rep_entrees == num_repeats min_val = check_measures[measure][0] max_val = check_measures[measure][1] for rep in range(num_rep_entrees): num_fold_entrees = len(fold_evaluations[measure][rep]) - self.assertEqual(num_fold_entrees, num_folds) + assert num_fold_entrees == num_folds for fold in range(num_fold_entrees): evaluation = fold_evaluations[measure][rep][fold] - self.assertIsInstance(evaluation, float) - self.assertGreaterEqual(evaluation, min_val) - self.assertLessEqual(evaluation, max_val) + assert isinstance(evaluation, float) + assert evaluation >= min_val + assert evaluation <= max_val def check_task_existence( - task_type: TaskType, dataset_id: int, target_name: str, **kwargs -) -> Union[int, None]: + task_type: TaskType, + dataset_id: int, + target_name: str, + **kwargs: dict[str, str | int | dict[str, str | int | openml.tasks.TaskType]], +) -> int | None: """Checks if any task with exists on test 
server that matches the meta data. Parameter @@ -261,9 +287,10 @@ def check_task_existence( """ return_val = None tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe") + assert isinstance(tasks, pd.DataFrame) if len(tasks) == 0: return None - tasks = cast(pd.DataFrame, tasks).loc[tasks["did"] == dataset_id] + tasks = tasks.loc[tasks["did"] == dataset_id] if len(tasks) == 0: return None tasks = tasks.loc[tasks["target_feature"] == target_name] @@ -305,13 +332,13 @@ class CustomImputer(SimpleImputer): Helps bypass the sklearn extension duplicate operation check """ - pass - def create_request_response( - *, status_code: int, content_filepath: pathlib.Path + *, + status_code: int, + content_filepath: pathlib.Path, ) -> requests.Response: - with open(content_filepath, "r") as xml_response: + with content_filepath.open("r") as xml_response: response_body = xml_response.read() response = requests.Response() diff --git a/openml/utils.py b/openml/utils.py index ffcc308dd..80d7caaae 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -1,37 +1,58 @@ # License: BSD 3-Clause +from __future__ import annotations -import os -import xmltodict +import contextlib import shutil -from typing import TYPE_CHECKING, List, Tuple, Union, Type import warnings -import pandas as pd from functools import wraps -import collections +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Mapping, TypeVar, overload +from typing_extensions import Literal, ParamSpec + +import numpy as np +import pandas as pd +import xmltodict import openml import openml._api_calls import openml.exceptions + from . 
import config # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: from openml.base import OpenMLBase -oslo_installed = False -try: - # Currently, importing oslo raises a lot of warning that it will stop working - # under python3.8; remove this once they disappear - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - from oslo_concurrency import lockutils + P = ParamSpec("P") + R = TypeVar("R") + + +@overload +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: Literal[True] = ..., +) -> Any | None: + ... + - oslo_installed = True -except ImportError: - pass +@overload +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: Literal[False], +) -> Any: + ... -def extract_xml_tags(xml_tag_name, node, allow_none=True): +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: bool = True, +) -> Any | None: """Helper to extract xml tags from xmltodict. Parameters @@ -39,7 +60,7 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True): xml_tag_name : str Name of the xml tag to extract from the node. - node : object + node : Mapping[str, Any] Node object returned by ``xmltodict`` from which ``xml_tag_name`` should be extracted. 
@@ -52,46 +73,48 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True): object """ if xml_tag_name in node and node[xml_tag_name] is not None: - if isinstance(node[xml_tag_name], dict): - rval = [node[xml_tag_name]] - elif isinstance(node[xml_tag_name], str): - rval = [node[xml_tag_name]] - elif isinstance(node[xml_tag_name], list): - rval = node[xml_tag_name] - else: - raise ValueError("Received not string and non list as tag item") + if isinstance(node[xml_tag_name], (dict, str)): + return [node[xml_tag_name]] + if isinstance(node[xml_tag_name], list): + return node[xml_tag_name] - return rval - else: - if allow_none: - return None - else: - raise ValueError("Could not find tag '%s' in node '%s'" % (xml_tag_name, str(node))) + raise ValueError("Received not string and non list as tag item") + + if allow_none: + return None + + raise ValueError(f"Could not find tag '{xml_tag_name}' in node '{node!s}'") -def _get_rest_api_type_alias(oml_object: "OpenMLBase") -> str: +def _get_rest_api_type_alias(oml_object: OpenMLBase) -> str: """Return the alias of the openml entity as it is defined for the REST API.""" - rest_api_mapping: List[Tuple[Union[Type, Tuple], str]] = [ + rest_api_mapping: list[tuple[type | tuple, str]] = [ (openml.datasets.OpenMLDataset, "data"), (openml.flows.OpenMLFlow, "flow"), (openml.tasks.OpenMLTask, "task"), (openml.runs.OpenMLRun, "run"), ((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), "study"), ] - _, api_type_alias = [ + _, api_type_alias = next( (python_type, api_alias) for (python_type, api_alias) in rest_api_mapping if isinstance(oml_object, python_type) - ][0] + ) return api_type_alias -def _tag_openml_base(oml_object: "OpenMLBase", tag: str, untag: bool = False): +def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None: # noqa: FBT001, FBT002 api_type_alias = _get_rest_api_type_alias(oml_object) - _tag_entity(api_type_alias, oml_object.id, tag, untag) + if oml_object.id is None: + raise 
openml.exceptions.ObjectNotPublishedError( + f"Cannot tag an {api_type_alias} that has not been published yet." + "Please publish the object first before being able to tag it." + f"\n{oml_object}", + ) + _tag_entity(entity_type=api_type_alias, entity_id=oml_object.id, tag=tag, untag=untag) -def _tag_entity(entity_type, entity_id, tag, untag=False): +def _tag_entity(entity_type: str, entity_id: int, tag: str, *, untag: bool = False) -> list[str]: """ Function that tags or untags a given entity on OpenML. As the OpenML API tag functions all consist of the same format, this function covers @@ -119,27 +142,32 @@ def _tag_entity(entity_type, entity_id, tag, untag=False): """ legal_entities = {"data", "task", "flow", "setup", "run"} if entity_type not in legal_entities: - raise ValueError("Can't tag a %s" % entity_type) + raise ValueError(f"Can't tag a {entity_type}") - uri = "%s/tag" % entity_type - main_tag = "oml:%s_tag" % entity_type if untag: - uri = "%s/untag" % entity_type - main_tag = "oml:%s_untag" % entity_type + uri = f"{entity_type}/untag" + main_tag = f"oml:{entity_type}_untag" + else: + uri = f"{entity_type}/tag" + main_tag = f"oml:{entity_type}_tag" - post_variables = {"%s_id" % entity_type: entity_id, "tag": tag} - result_xml = openml._api_calls._perform_api_call(uri, "post", post_variables) + result_xml = openml._api_calls._perform_api_call( + uri, + "post", + {f"{entity_type}_id": entity_id, "tag": tag}, + ) result = xmltodict.parse(result_xml, force_list={"oml:tag"})[main_tag] if "oml:tag" in result: - return result["oml:tag"] - else: - # no tags, return empty list - return [] + return result["oml:tag"] # type: ignore + + # no tags, return empty list + return [] -def _delete_entity(entity_type, entity_id): +# TODO(eddiebergman): Maybe this can be made more specific with a Literal +def _delete_entity(entity_type: str, entity_id: int) -> bool: """ Function that deletes a given entity on OpenML. 
As the OpenML API tag functions all consist of the same format, this function covers @@ -197,7 +225,7 @@ def _delete_entity(entity_type, entity_id): message=( f"The {entity_type} can not be deleted because " f"it still has associated entities: {e.message}" - ) + ), ) from e if e.code in unknown_reason: raise openml.exceptions.OpenMLServerError( @@ -209,7 +237,42 @@ def _delete_entity(entity_type, entity_id): raise -def _list_all(listing_call, output_format="dict", *args, **filters): +@overload +def _list_all( + listing_call: Callable[P, Any], + list_output_format: Literal["dict"] = ..., + *args: P.args, + **filters: P.kwargs, +) -> dict: + ... + + +@overload +def _list_all( + listing_call: Callable[P, Any], + list_output_format: Literal["object"], + *args: P.args, + **filters: P.kwargs, +) -> dict: + ... + + +@overload +def _list_all( + listing_call: Callable[P, Any], + list_output_format: Literal["dataframe"], + *args: P.args, + **filters: P.kwargs, +) -> pd.DataFrame: + ... + + +def _list_all( # noqa: C901, PLR0912 + listing_call: Callable[P, Any], + list_output_format: Literal["dict", "dataframe", "object"] = "dict", + *args: P.args, + **filters: P.kwargs, +) -> dict | pd.DataFrame: """Helper to handle paged listing requests. Example usage: @@ -220,49 +283,45 @@ def _list_all(listing_call, output_format="dict", *args, **filters): ---------- listing_call : callable Call listing, e.g. list_evaluations. - output_format : str, optional (default='dict') + list_output_format : str, optional (default='dict') The parameter decides the format of the output. - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame + - If 'object' the output is a dict of objects (only for some `listing_call`) *args : Variable length argument list Any required arguments for the listing call. **filters : Arbitrary keyword arguments Any filters that can be applied to the listing function. additionally, the batch_size can be specified. 
This is useful for testing purposes. + Returns ------- dict or dataframe """ - # eliminate filters that have a None value active_filters = {key: value for key, value in filters.items() if value is not None} page = 0 - result = collections.OrderedDict() - if output_format == "dataframe": - result = pd.DataFrame() + result = pd.DataFrame() if list_output_format == "dataframe" else {} # Default batch size per paging. # This one can be set in filters (batch_size), but should not be # changed afterwards. The derived batch_size can be changed. - BATCH_SIZE_ORIG = 10000 - if "batch_size" in active_filters: - BATCH_SIZE_ORIG = active_filters["batch_size"] - del active_filters["batch_size"] + BATCH_SIZE_ORIG = active_filters.pop("batch_size", 10000) + if not isinstance(BATCH_SIZE_ORIG, int): + raise ValueError(f"'batch_size' should be an integer but got {BATCH_SIZE_ORIG}") # max number of results to be shown - LIMIT = None - offset = 0 - if "size" in active_filters: - LIMIT = active_filters["size"] - del active_filters["size"] + LIMIT: int | float | None = active_filters.pop("size", None) # type: ignore + if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)): + raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}") if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT: BATCH_SIZE_ORIG = LIMIT - if "offset" in active_filters: - offset = active_filters["offset"] - del active_filters["offset"] + offset = active_filters.pop("offset", 0) + if not isinstance(offset, int): + raise ValueError(f"'offset' should be an integer but got {offset}") batch_size = BATCH_SIZE_ORIG while True: @@ -270,24 +329,27 @@ def _list_all(listing_call, output_format="dict", *args, **filters): current_offset = offset + BATCH_SIZE_ORIG * page new_batch = listing_call( *args, - limit=batch_size, - offset=current_offset, - output_format=output_format, - **active_filters, + output_format=list_output_format, # type: ignore + **{**active_filters, "limit": batch_size, 
"offset": current_offset}, # type: ignore ) except openml.exceptions.OpenMLServerNoResult: # we want to return an empty dict in this case + # NOTE: This above statement may not actually happen, but we could just return here + # to enforce it... break - if output_format == "dataframe": + + if list_output_format == "dataframe": if len(result) == 0: result = new_batch else: result = pd.concat([result, new_batch], ignore_index=True) else: - # For output_format = 'dict' or 'object' + # For output_format = 'dict' (or catch all) result.update(new_batch) + if len(new_batch) < batch_size: break + page += 1 if LIMIT is not None: # check if the number of required results has been achieved @@ -295,24 +357,24 @@ def _list_all(listing_call, output_format="dict", *args, **filters): # in case of bugs to prevent infinite loops if len(result) >= LIMIT: break + # check if there are enough results to fulfill a batch - if BATCH_SIZE_ORIG > LIMIT - len(result): + if LIMIT - len(result) < BATCH_SIZE_ORIG: batch_size = LIMIT - len(result) return result -def _get_cache_dir_for_key(key): - cache = config.get_cache_directory() - return os.path.join(cache, key) +def _get_cache_dir_for_key(key: str) -> Path: + return Path(config.get_cache_directory()) / key -def _create_cache_directory(key): +def _create_cache_directory(key: str) -> Path: cache_dir = _get_cache_dir_for_key(key) try: - os.makedirs(cache_dir, exist_ok=True) - except Exception as e: + cache_dir.mkdir(exist_ok=True, parents=True) + except Exception as e: # noqa: BLE001 raise openml.exceptions.OpenMLCacheException( f"Cannot create cache directory {cache_dir}." 
) from e @@ -320,16 +382,12 @@ def _create_cache_directory(key): return cache_dir -def _get_cache_dir_for_id(key, id_, create=False): - if create: - cache_dir = _create_cache_directory(key) - else: - cache_dir = _get_cache_dir_for_key(key) +def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path: # noqa: FBT001, FBT002 + cache_dir = _create_cache_directory(key) if create else _get_cache_dir_for_key(key) + return Path(cache_dir) / str(id_) - return os.path.join(cache_dir, str(id_)) - -def _create_cache_directory_for_id(key, id_): +def _create_cache_directory_for_id(key: str, id_: int) -> Path: """Create the cache directory for a specific ID In order to have a clearer cache structure and because every task @@ -347,20 +405,18 @@ def _create_cache_directory_for_id(key, id_): Returns ------- - str + cache_dir : Path Path of the created dataset cache directory. """ cache_dir = _get_cache_dir_for_id(key, id_, create=True) - if os.path.isdir(cache_dir): - pass - elif os.path.exists(cache_dir): + if cache_dir.exists() and not cache_dir.is_dir(): raise ValueError("%s cache dir exists but is not a directory!" % key) - else: - os.makedirs(cache_dir) + + cache_dir.mkdir(exist_ok=True, parents=True) return cache_dir -def _remove_cache_dir_for_id(key, cache_dir): +def _remove_cache_dir_for_id(key: str, cache_dir: Path) -> None: """Remove the task cache directory This function is NOT thread/multiprocessing safe. @@ -373,18 +429,22 @@ def _remove_cache_dir_for_id(key, cache_dir): """ try: shutil.rmtree(cache_dir) - except (OSError, IOError): + except OSError as e: raise ValueError( - "Cannot remove faulty %s cache directory %s." - "Please do this manually!" % (key, cache_dir) - ) + f"Cannot remove faulty {key} cache directory {cache_dir}. 
Please do this manually!", + ) from e -def thread_safe_if_oslo_installed(func): - if oslo_installed: +def thread_safe_if_oslo_installed(func: Callable[P, R]) -> Callable[P, R]: + try: + # Currently, importing oslo raises a lot of warning that it will stop working + # under python3.8; remove this once they disappear + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + from oslo_concurrency import lockutils @wraps(func) - def safe_func(*args, **kwargs): + def safe_func(*args: P.args, **kwargs: P.kwargs) -> R: # Lock directories use the id that is passed as either positional or keyword argument. id_parameters = [parameter_name for parameter_name in kwargs if "_id" in parameter_name] if len(id_parameters) == 1: @@ -393,24 +453,21 @@ def safe_func(*args, **kwargs): id_ = args[0] else: raise RuntimeError( - "An id must be specified for {}, was passed: ({}, {}).".format( - func.__name__, args, kwargs - ) + f"An id must be specified for {func.__name__}, was passed: ({args}, {kwargs}).", ) # The [7:] gets rid of the 'openml.' prefix - lock_name = "{}.{}:{}".format(func.__module__[7:], func.__name__, id_) + lock_name = f"{func.__module__[7:]}.{func.__name__}:{id_}" with lockutils.external_lock(name=lock_name, lock_path=_create_lockfiles_dir()): return func(*args, **kwargs) return safe_func - else: + except ImportError: return func -def _create_lockfiles_dir(): - dir = os.path.join(config.get_cache_directory(), "locks") - try: - os.makedirs(dir) - except OSError: - pass - return dir +def _create_lockfiles_dir() -> Path: + path = Path(config.get_cache_directory()) / "locks" + # TODO(eddiebergman): Not sure why this is allowed to error and ignore??? 
+ with contextlib.suppress(OSError): + path.mkdir(exist_ok=True, parents=True) + return path diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..99ff2b804 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,318 @@ +# -*- coding: utf-8 -*- + +# License: BSD 3-Clause +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "openml" +dynamic = ["version"] # Will take it from the __version__ file, update there +dependencies = [ + "liac-arff>=2.4.0", + "xmltodict", + "requests", + "scikit-learn>=0.18", + "python-dateutil", # Installed through pandas anyway. + "pandas>=1.0.0", + "scipy>=0.13.3", + "numpy>=1.6.2", + "minio", + "pyarrow", +] +requires-python = ">=3.8" +authors = [ + { name = "Matthias Feurer", email="feurerm@informatik.uni-freiburg.de" }, + { name = "Jan van Rijn" }, + { name = "Arlind Kadra" }, + { name = "Pieter Gijsbers" }, + { name = "Neeratyoy Mallik" }, + { name = "Sahithya Ravi" }, + { name = "Andreas Müller" }, + { name = "Joaquin Vanschoren " }, + { name = "Frank Hutter" }, +] +readme = "README.md" +description = "Python API for OpenML" +classifiers = [ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] +license = { file = "LICENSE" } + +[project.scripts] +openml = "openml.cli:main" + +[project.optional-dependencies] +test=[ + "nbconvert", + "jupyter_client", + "matplotlib", + "pytest", + "pytest-xdist", + "pytest-timeout", + "nbformat", + "oslo.concurrency", + "flaky", + "pre-commit", + "pytest-cov", + "pytest-rerunfailures", + 
"mypy", + "ruff", +] +examples=[ + "matplotlib", + "jupyter", + "notebook", + "nbconvert", + "nbformat", + "jupyter_client", + "ipython", + "ipykernel", + "seaborn", +] +examples_unix=["fanova"] +docs=[ + "sphinx>=3", + "sphinx-gallery", + "sphinx_bootstrap_theme", + "numpydoc", +] + +[project.urls] +home="https://openml.org/" +documentation = "https://openml.github.io/openml-python/" +source = "https://github.com/openml/openml-python" + +[tool.setuptools.packages.find] +where = [""] +include = ["openml*"] +namespaces = false + +[tool.setuptools.package-data] +openml = ["*.txt", "*.md", "py.typed"] + +[tool.setuptools.dynamic] +version = {attr = "openml.__version__.__version__"} + +# https://docs.pytest.org/en/7.2.x/reference/reference.html#ini-options-ref +[tool.pytest.ini_options] +testpaths = ["tests"] +minversion = "7.0" +xfail_strict = true +filterwarnings=[ + "ignore:the matrix subclass:PendingDeprecationWarning" +] +markers = [ + "server: anything that connects to a server", + "upload: anything that uploads to a server", + "production: any interaction with the production server", + "cache: anything that interacts with the (test) cache", +] + +# https://github.com/charliermarsh/ruff +[tool.ruff] +target-version = "py37" +line-length = 100 +show-source = true +src = ["openml", "tests", "examples"] +unsafe-fixes = true + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +select = [ + "A", + # "ANN", # Handled by mypy + "ARG", + "B", + "BLE", + "COM", + "C4", + "D", + # "DTZ", # One day I should know how to utilize timezones and dates... 
+ "E", + # "EXE", Meh + "ERA", + "F", + "FBT", + "I", + # "ISC", # Favours implicit string concatenation + "INP", + # "INT", # I don't understand this one + "N", + "NPY", + "PD", + "PLC", + "PLE", + "PLR", + "PLW", + "PIE", + "PT", + "PTH", + # "PYI", # Specific to .pyi files for type stubs + "Q", + "PGH004", + "RET", + "RUF", + "C90", + "S", + # "SLF", # Private member accessed (sure, it's python) + "SIM", + # "TRY", # Good in principle, would take a lot of work to statisfy + "T10", + "T20", + "TID", + "TCH", + "UP", + "N", + "W", + "YTT", +] + +ignore = [ + "D105", # Missing docstring in magic mthod + "D401", # First line of docstring should be in imperative mood + "N806", # Variable X in function should be lowercase + "E731", # Do not assign a lambda expression, use a def + "S101", # Use of assert detected. + "W292", # No newline at end of file + "PLC1901", # "" can be simplified to be falsey + "TCH003", # Move stdlib import into TYPE_CHECKING + "COM812", # Trailing comma missing (handled by linter, ruff recommend disabling if using formatter) + "N803", # Argument should be lowercase (but we accept things like `X`) + + # TODO(@eddibergman): These should be enabled + "D100", # Missing docstring in public module + "D103", # Missing docstring in public function + "D104", # Missing docstring in public package + + # TODO(@eddiebergman): Maybe fix + "PLR2004", # Magic value used in comparison, consider replacing 2 with a constant variable + "D400", # First line must end with a period (@eddiebergman too many to fix so ignoring this for now) + "D203", # 1 blank line required before class docstring + "D205", # 1 blank line between summary and description + + # TODO(@eddiebergman): Could be backwards breaking + "N802", # Public function name should be lower case (i.e. 
get_X()) +] + +exclude = [ + # TODO(eddiebergman): Tests should be re-enabled after the refactor + "tests", + # + ".bzr", + ".direnv", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv", + "docs", +] + +# Exclude a variety of commonly ignored directories. +[tool.ruff.per-file-ignores] +"tests/*.py" = [ + "D100", # Undocumented public module + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "S101", # Use of assert + "ANN201", # Missing return type annotation for public function + "FBT001", # Positional boolean argument + "PLR2004",# No use of magic numbers + "PD901", # X is a bad variable name. (pandas) + "TCH", # https://docs.astral.sh/ruff/rules/#flake8-type-checking-tch + "N803", # Argument name {name} should be lowercase +] +"openml/cli.py" = [ + "T201", # print found + "T203", # pprint found +] +"openml/__version__.py" = [ + "D100", # Undocumented public module +] +"__init__.py" = [ + "I002", # Missing required import (i.e. from __future__ import annotations) +] +"examples/*.py" = [ + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D415", # First line should end with a . or ? or ! + "INP001", # File is part of an implicit namespace package, add an __init__.py + "I002", # Missing required import (i.e. 
from __future__ import annotations) + "E741", # Ambiguous variable name + "T201", # print found + "T203", # pprint found + "ERA001", # found commented out code + "E402", # Module level import not at top of cell + "E501", # Line too long +] + + +[tool.ruff.isort] +known-first-party = ["openml"] +no-lines-before = ["future"] +required-imports = ["from __future__ import annotations"] +combine-as-imports = true +extra-standard-library = ["typing_extensions"] +force-wrap-aliases = true + +[tool.ruff.pydocstyle] +convention = "numpy" + +[tool.mypy] +python_version = "3.7" +packages = ["openml", "tests"] + +show_error_codes = true + +warn_unused_configs = true # warn about unused [tool.mypy] lines + +follow_imports = "normal" # Type check top level api code we use from imports +ignore_missing_imports = false # prefer explicit ignores + +disallow_untyped_defs = true # All functions must have types +disallow_untyped_decorators = true # ... even decorators +disallow_incomplete_defs = true # ...all types + +no_implicit_optional = true +check_untyped_defs = true + +warn_return_any = true + + +[[tool.mypy.overrides]] +module = ["tests.*", "openml.extensions.sklearn.*"] + +# TODO(eddiebergman): This should be re-enabled after tests get refactored +ignore_errors = true +#disallow_untyped_defs = false # Sometimes we just want to ignore verbose types +#disallow_untyped_decorators = false # Test decorators are not properly typed +#disallow_incomplete_defs = false # Sometimes we just want to ignore verbose types +#disable_error_code = ["var-annotated"] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 726c8fa73..000000000 --- a/setup.cfg +++ /dev/null @@ -1,6 +0,0 @@ -[metadata] -description-file = README.md - -[tool:pytest] -filterwarnings = - ignore:the matrix subclass:PendingDeprecationWarning diff --git a/setup.py b/setup.py deleted file mode 100644 index 9f3cdd0e6..000000000 --- a/setup.py +++ /dev/null @@ -1,112 +0,0 @@ -# -*- coding: utf-8 -*- - -# License: 
BSD 3-Clause - -import os -import setuptools -import sys - -with open("openml/__version__.py") as fh: - version = fh.readlines()[-1].split()[-1].strip("\"'") - -if sys.version_info < (3, 6): - raise ValueError( - "Unsupported Python version {}.{}.{} found. OpenML requires Python 3.6 or higher.".format( - sys.version_info.major, sys.version_info.minor, sys.version_info.micro - ) - ) - -with open(os.path.join("README.md"), encoding="utf-8") as fid: - README = fid.read() - -setuptools.setup( - name="openml", - author="Matthias Feurer, Jan van Rijn, Arlind Kadra, Pieter Gijsbers, " - "Neeratyoy Mallik, Sahithya Ravi, Andreas Müller, Joaquin Vanschoren " - "and Frank Hutter", - author_email="feurerm@informatik.uni-freiburg.de", - maintainer="Matthias Feurer", - maintainer_email="feurerm@informatik.uni-freiburg.de", - description="Python API for OpenML", - long_description=README, - long_description_content_type="text/markdown", - license="BSD 3-clause", - url="https://openml.org/", - project_urls={ - "Documentation": "https://openml.github.io/openml-python/", - "Source Code": "https://github.com/openml/openml-python", - }, - version=version, - # Make sure to remove stale files such as the egg-info before updating this: - # https://stackoverflow.com/a/26547314 - packages=setuptools.find_packages( - include=["openml.*", "openml"], - exclude=["*.tests", "*.tests.*", "tests.*", "tests"], - ), - package_data={"": ["*.txt", "*.md", "py.typed"]}, - python_requires=">=3.6", - install_requires=[ - "liac-arff>=2.4.0", - "xmltodict", - "requests", - "scikit-learn>=0.18", - "python-dateutil", # Installed through pandas anyway. 
- "pandas>=1.0.0", - "scipy>=0.13.3", - "numpy>=1.6.2", - "minio", - "pyarrow", - ], - extras_require={ - "test": [ - "nbconvert", - "jupyter_client", - "matplotlib", - "pytest", - "pytest-xdist", - "pytest-timeout", - "nbformat", - "oslo.concurrency", - "flaky", - "pre-commit", - "pytest-cov", - "pytest-rerunfailures", - "mypy", - ], - "examples": [ - "matplotlib", - "jupyter", - "notebook", - "nbconvert", - "nbformat", - "jupyter_client", - "ipython", - "ipykernel", - "seaborn", - ], - "examples_unix": ["fanova"], - "docs": [ - "sphinx>=3", - "sphinx-gallery", - "sphinx_bootstrap_theme", - "numpydoc", - ], - }, - test_suite="pytest", - classifiers=[ - "Intended Audience :: Science/Research", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python", - "Topic :: Software Development", - "Topic :: Scientific/Engineering", - "Operating System :: POSIX", - "Operating System :: Unix", - "Operating System :: MacOS", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - ], - entry_points={"console_scripts": ["openml=openml.cli:main"]}, -) diff --git a/tests/conftest.py b/tests/conftest.py index 43e2cc3ee..62fe3c7e8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,11 +21,11 @@ """ # License: BSD 3-Clause +from __future__ import annotations -import os import logging -import pathlib -from typing import List +import os +from pathlib import Path import pytest import openml @@ -52,29 +52,29 @@ def worker_id() -> str: return "master" -def read_file_list() -> List[pathlib.Path]: +def read_file_list() -> list[Path]: """Returns a list of paths to all files that currently exist in 'openml/tests/files/' - :return: List[pathlib.Path] + :return: List[Path] """ - test_files_dir = pathlib.Path(__file__).parent / "files" + test_files_dir = Path(__file__).parent / "files" return [f for f in 
test_files_dir.rglob("*") if f.is_file()] -def compare_delete_files(old_list: List[pathlib.Path], new_list: List[pathlib.Path]) -> None: +def compare_delete_files(old_list: list[Path], new_list: list[Path]) -> None: """Deletes files that are there in the new_list but not in the old_list - :param old_list: List[pathlib.Path] - :param new_list: List[pathlib.Path] + :param old_list: List[Path] + :param new_list: List[Path] :return: None """ file_list = list(set(new_list) - set(old_list)) for file in file_list: os.remove(file) - logger.info("Deleted from local: {}".format(file)) + logger.info(f"Deleted from local: {file}") -def delete_remote_files(tracker) -> None: +def delete_remote_files(tracker, flow_names) -> None: """Function that deletes the entities passed as input, from the OpenML test server The TestBase class in openml/testing.py has an attribute called publish_tracker. @@ -94,27 +94,27 @@ def delete_remote_files(tracker) -> None: # reordering to delete sub flows at the end of flows # sub-flows have shorter names, hence, sorting by descending order of flow name length if "flow" in tracker: + to_sort = list(zip(tracker["flow"], flow_names)) flow_deletion_order = [ - entity_id - for entity_id, _ in sorted(tracker["flow"], key=lambda x: len(x[1]), reverse=True) + entity_id for entity_id, _ in sorted(to_sort, key=lambda x: len(x[1]), reverse=True) ] - tracker["flow"] = flow_deletion_order + tracker["flow"] = [flow_deletion_order[1] for flow_id, _ in flow_deletion_order] # deleting all collected entities published to test server # 'run's are deleted first to prevent dependency issue of entities on deletion logger.info("Entity Types: {}".format(["run", "data", "flow", "task", "study"])) for entity_type in ["run", "data", "flow", "task", "study"]: - logger.info("Deleting {}s...".format(entity_type)) - for i, entity in enumerate(tracker[entity_type]): + logger.info(f"Deleting {entity_type}s...") + for _i, entity in enumerate(tracker[entity_type]): try: 
openml.utils._delete_entity(entity_type, entity) - logger.info("Deleted ({}, {})".format(entity_type, entity)) + logger.info(f"Deleted ({entity_type}, {entity})") except Exception as e: - logger.warning("Cannot delete ({},{}): {}".format(entity_type, entity, e)) + logger.warning(f"Cannot delete ({entity_type},{entity}): {e}") def pytest_sessionstart() -> None: - """pytest hook that is executed before any unit test starts + """Pytest hook that is executed before any unit test starts This function will be called by each of the worker processes, along with the master process when they are spawned. This happens even before the collection of unit tests. @@ -136,7 +136,7 @@ def pytest_sessionstart() -> None: def pytest_sessionfinish() -> None: - """pytest hook that is executed after all unit tests of a worker ends + """Pytest hook that is executed after all unit tests of a worker ends This function will be called by each of the worker processes, along with the master process when they are done with the unit tests allocated to them. 
@@ -154,11 +154,11 @@ def pytest_sessionfinish() -> None: # allows access to the file_list read in the set up phase global file_list worker = worker_id() - logger.info("Finishing worker {}".format(worker)) + logger.info(f"Finishing worker {worker}") # Test file deletion - logger.info("Deleting files uploaded to test server for worker {}".format(worker)) - delete_remote_files(TestBase.publish_tracker) + logger.info(f"Deleting files uploaded to test server for worker {worker}") + delete_remote_files(TestBase.publish_tracker, TestBase.flow_name_tracker) if worker == "master": # Local file deletion @@ -166,7 +166,7 @@ def pytest_sessionfinish() -> None: compare_delete_files(file_list, new_file_list) logger.info("Local files deleted") - logger.info("{} is killed".format(worker)) + logger.info(f"{worker} is killed") def pytest_configure(config): @@ -182,16 +182,58 @@ def pytest_addoption(parser): ) +def _expected_static_cache_state(root_dir: Path) -> list[Path]: + _c_root_dir = root_dir / "org" / "openml" / "test" + res_paths = [root_dir, _c_root_dir] + + for _d in ["datasets", "tasks", "runs", "setups"]: + res_paths.append(_c_root_dir / _d) + + for _id in ["-1","2"]: + tmp_p = _c_root_dir / "datasets" / _id + res_paths.extend([ + tmp_p / "dataset.arff", + tmp_p / "features.xml", + tmp_p / "qualities.xml", + tmp_p / "description.xml", + ]) + + res_paths.append(_c_root_dir / "datasets" / "30" / "dataset_30.pq") + res_paths.append(_c_root_dir / "runs" / "1" / "description.xml") + res_paths.append(_c_root_dir / "setups" / "1" / "description.xml") + + for _id in ["1", "3", "1882"]: + tmp_p = _c_root_dir / "tasks" / _id + res_paths.extend([ + tmp_p / "datasplits.arff", + tmp_p / "task.xml", + ]) + + return res_paths + + +def assert_static_test_cache_correct(root_dir: Path) -> None: + for p in _expected_static_cache_state(root_dir): + assert p.exists(), f"Expected path {p} does not exist" + + @pytest.fixture(scope="class") def long_version(request): request.cls.long_version = 
request.config.getoption("--long") -@pytest.fixture -def test_files_directory() -> pathlib.Path: - return pathlib.Path(__file__).parent / "files" +@pytest.fixture() +def test_files_directory() -> Path: + return Path(__file__).parent / "files" @pytest.fixture() def test_api_key() -> str: return "c0c42819af31e706efe1f4b88c23c6c1" + + +@pytest.fixture(autouse=True) +def verify_cache_state(test_files_directory) -> None: + assert_static_test_cache_correct(test_files_directory) + yield + assert_static_test_cache_correct(test_files_directory) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 93e0247d2..80da9c842 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -1,8 +1,9 @@ # License: BSD 3-Clause +from __future__ import annotations import os -from time import time import unittest.mock +from time import time import numpy as np import pandas as pd @@ -10,16 +11,17 @@ from scipy import sparse import openml -from openml.testing import TestBase +from openml.datasets import OpenMLDataFeature, OpenMLDataset from openml.exceptions import PyOpenMLError -from openml.datasets import OpenMLDataset, OpenMLDataFeature +from openml.testing import TestBase +@pytest.mark.production() class OpenMLDatasetTest(TestBase): _multiprocess_can_split_ = True def setUp(self): - super(OpenMLDatasetTest, self).setUp() + super().setUp() openml.config.server = self.production_server # Load dataset id 2 - dataset 2 is interesting because it contains @@ -77,7 +79,9 @@ def test_init_string_validation(self): with pytest.raises(ValueError, match="Invalid symbols 'ü' in citation"): openml.datasets.OpenMLDataset( - name="somename", description="a description", citation="Something by Müller" + name="somename", + description="a description", + citation="Something by Müller", ) def test__unpack_categories_with_nan_likes(self): @@ -94,14 +98,14 @@ def test__unpack_categories_with_nan_likes(self): def test_get_data_array(self): 
# Basic usage rval, _, categorical, attribute_names = self.dataset.get_data(dataset_format="array") - self.assertIsInstance(rval, np.ndarray) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual((898, 39), rval.shape) - self.assertEqual(len(categorical), 39) - self.assertTrue(all([isinstance(cat, bool) for cat in categorical])) - self.assertEqual(len(attribute_names), 39) - self.assertTrue(all([isinstance(att, str) for att in attribute_names])) - self.assertIsNone(_) + assert isinstance(rval, np.ndarray) + assert rval.dtype == np.float32 + assert rval.shape == (898, 39) + assert len(categorical) == 39 + assert all(isinstance(cat, bool) for cat in categorical) + assert len(attribute_names) == 39 + assert all(isinstance(att, str) for att in attribute_names) + assert _ is None # check that an error is raised when the dataset contains string err_msg = "PyOpenML cannot handle string when returning numpy arrays" @@ -110,9 +114,9 @@ def test_get_data_array(self): def test_get_data_pandas(self): data, _, _, _ = self.titanic.get_data(dataset_format="dataframe") - self.assertTrue(isinstance(data, pd.DataFrame)) - self.assertEqual(data.shape[1], len(self.titanic.features)) - self.assertEqual(data.shape[0], 1309) + assert isinstance(data, pd.DataFrame) + assert data.shape[1] == len(self.titanic.features) + assert data.shape[0] == 1309 col_dtype = { "pclass": "uint8", "survived": "category", @@ -130,30 +134,31 @@ def test_get_data_pandas(self): "home.dest": "object", } for col_name in data.columns: - self.assertTrue(data[col_name].dtype.name == col_dtype[col_name]) + assert data[col_name].dtype.name == col_dtype[col_name] X, y, _, _ = self.titanic.get_data( - dataset_format="dataframe", target=self.titanic.default_target_attribute + dataset_format="dataframe", + target=self.titanic.default_target_attribute, ) - self.assertTrue(isinstance(X, pd.DataFrame)) - self.assertTrue(isinstance(y, pd.Series)) - self.assertEqual(X.shape, (1309, 13)) - self.assertEqual(y.shape, 
(1309,)) + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) + assert X.shape == (1309, 13) + assert y.shape == (1309,) for col_name in X.columns: - self.assertTrue(X[col_name].dtype.name == col_dtype[col_name]) - self.assertTrue(y.dtype.name == col_dtype["survived"]) + assert X[col_name].dtype.name == col_dtype[col_name] + assert y.dtype.name == col_dtype["survived"] @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_boolean_pandas(self): # test to check that we are converting properly True and False even # with some inconsistency when dumping the data on openml data, _, _, _ = self.jm1.get_data() - self.assertTrue(data["defects"].dtype.name == "category") - self.assertTrue(set(data["defects"].cat.categories) == {True, False}) + assert data["defects"].dtype.name == "category" + assert set(data["defects"].cat.categories) == {True, False} data, _, _, _ = self.pc4.get_data() - self.assertTrue(data["c"].dtype.name == "category") - self.assertTrue(set(data["c"].cat.categories) == {True, False}) + assert data["c"].dtype.name == "category" + assert set(data["c"].cat.categories) == {True, False} def test_get_data_no_str_data_for_nparrays(self): # check that an error is raised when the dataset contains string @@ -169,59 +174,59 @@ def _check_expected_type(self, dtype, is_cat, col): else: expected_type = "float64" - self.assertEqual(dtype.name, expected_type) + assert dtype.name == expected_type @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_rowid(self): self.dataset.row_id_attribute = "condition" rval, _, categorical, _ = self.dataset.get_data(include_row_id=True) - self.assertIsInstance(rval, pd.DataFrame) + assert isinstance(rval, pd.DataFrame) for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): self._check_expected_type(dtype, is_cat, rval[col]) - self.assertEqual(rval.shape, (898, 39)) - self.assertEqual(len(categorical), 39) + assert rval.shape == 
(898, 39) + assert len(categorical) == 39 rval, _, categorical, _ = self.dataset.get_data() - self.assertIsInstance(rval, pd.DataFrame) + assert isinstance(rval, pd.DataFrame) for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): self._check_expected_type(dtype, is_cat, rval[col]) - self.assertEqual(rval.shape, (898, 38)) - self.assertEqual(len(categorical), 38) + assert rval.shape == (898, 38) + assert len(categorical) == 38 def test_get_data_with_target_array(self): X, y, _, attribute_names = self.dataset.get_data(dataset_format="array", target="class") - self.assertIsInstance(X, np.ndarray) - self.assertEqual(X.dtype, np.float32) - self.assertEqual(X.shape, (898, 38)) - self.assertIn(y.dtype, [np.int32, np.int64]) - self.assertEqual(y.shape, (898,)) - self.assertEqual(len(attribute_names), 38) - self.assertNotIn("class", attribute_names) + assert isinstance(X, np.ndarray) + assert X.dtype == np.float32 + assert X.shape == (898, 38) + assert y.dtype in [np.int32, np.int64] + assert y.shape == (898,) + assert len(attribute_names) == 38 + assert "class" not in attribute_names @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_target_pandas(self): X, y, categorical, attribute_names = self.dataset.get_data(target="class") - self.assertIsInstance(X, pd.DataFrame) + assert isinstance(X, pd.DataFrame) for dtype, is_cat, col in zip(X.dtypes, categorical, X): self._check_expected_type(dtype, is_cat, X[col]) - self.assertIsInstance(y, pd.Series) - self.assertEqual(y.dtype.name, "category") + assert isinstance(y, pd.Series) + assert y.dtype.name == "category" - self.assertEqual(X.shape, (898, 38)) - self.assertEqual(len(attribute_names), 38) - self.assertEqual(y.shape, (898,)) + assert X.shape == (898, 38) + assert len(attribute_names) == 38 + assert y.shape == (898,) - self.assertNotIn("class", attribute_names) + assert "class" not in attribute_names def test_get_data_rowid_and_ignore_and_target(self): 
self.dataset.ignore_attribute = ["condition"] self.dataset.row_id_attribute = ["hardness"] X, y, categorical, names = self.dataset.get_data(target="class") - self.assertEqual(X.shape, (898, 36)) - self.assertEqual(len(categorical), 36) + assert X.shape == (898, 36) + assert len(categorical) == 36 cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3 self.assertListEqual(categorical, cats) - self.assertEqual(y.shape, (898,)) + assert y.shape == (898,) @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_ignore_attributes(self): @@ -229,26 +234,26 @@ def test_get_data_with_ignore_attributes(self): rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True) for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): self._check_expected_type(dtype, is_cat, rval[col]) - self.assertEqual(rval.shape, (898, 39)) - self.assertEqual(len(categorical), 39) + assert rval.shape == (898, 39) + assert len(categorical) == 39 rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False) for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): self._check_expected_type(dtype, is_cat, rval[col]) - self.assertEqual(rval.shape, (898, 38)) - self.assertEqual(len(categorical), 38) + assert rval.shape == (898, 38) + assert len(categorical) == 38 def test_get_data_with_nonexisting_class(self): # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However, # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to # indices 4 and 5, and that nothing is mapped to index 3. 
_, y, _, _ = self.dataset.get_data("class", dataset_format="dataframe") - self.assertEqual(list(y.dtype.categories), ["1", "2", "3", "4", "5", "U"]) + assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"] _, y, _, _ = self.dataset.get_data("class", dataset_format="array") - self.assertEqual(np.min(y), 0) - self.assertEqual(np.max(y), 5) + assert np.min(y) == 0 + assert np.max(y) == 5 # Check that no label is mapped to 3, since it is reserved for label '4'. - self.assertEqual(np.sum(y == 3), 0) + assert np.sum(y == 3) == 0 def test_get_data_corrupt_pickle(self): # Lazy loaded dataset, populate cache. @@ -259,155 +264,216 @@ def test_get_data_corrupt_pickle(self): # Despite the corrupt file, the data should be loaded from the ARFF file. # A warning message is written to the python logger. xy, _, _, _ = self.iris.get_data() - self.assertIsInstance(xy, pd.DataFrame) - self.assertEqual(xy.shape, (150, 5)) + assert isinstance(xy, pd.DataFrame) + assert xy.shape == (150, 5) def test_lazy_loading_metadata(self): # Initial Setup did_cache_dir = openml.utils._create_cache_directory_for_id( - openml.datasets.functions.DATASETS_CACHE_DIR_NAME, 2 + openml.datasets.functions.DATASETS_CACHE_DIR_NAME, + 2, ) _compare_dataset = openml.datasets.get_dataset( - 2, download_data=False, download_features_meta_data=True, download_qualities=True + 2, + download_data=False, + download_features_meta_data=True, + download_qualities=True, ) change_time = os.stat(did_cache_dir).st_mtime # Test with cache _dataset = openml.datasets.get_dataset( - 2, download_data=False, download_features_meta_data=False, download_qualities=False + 2, + download_data=False, + download_features_meta_data=False, + download_qualities=False, ) - self.assertEqual(change_time, os.stat(did_cache_dir).st_mtime) - self.assertEqual(_dataset.features, _compare_dataset.features) - self.assertEqual(_dataset.qualities, _compare_dataset.qualities) + assert change_time == os.stat(did_cache_dir).st_mtime + assert 
_dataset.features == _compare_dataset.features + assert _dataset.qualities == _compare_dataset.qualities # -- Test without cache openml.utils._remove_cache_dir_for_id( - openml.datasets.functions.DATASETS_CACHE_DIR_NAME, did_cache_dir + openml.datasets.functions.DATASETS_CACHE_DIR_NAME, + did_cache_dir, ) _dataset = openml.datasets.get_dataset( - 2, download_data=False, download_features_meta_data=False, download_qualities=False + 2, + download_data=False, + download_features_meta_data=False, + download_qualities=False, ) - self.assertEqual(["description.xml"], os.listdir(did_cache_dir)) - self.assertNotEqual(change_time, os.stat(did_cache_dir).st_mtime) - self.assertEqual(_dataset.features, _compare_dataset.features) - self.assertEqual(_dataset.qualities, _compare_dataset.qualities) + assert ["description.xml"] == os.listdir(did_cache_dir) + assert change_time != os.stat(did_cache_dir).st_mtime + assert _dataset.features == _compare_dataset.features + assert _dataset.qualities == _compare_dataset.qualities class OpenMLDatasetTestOnTestServer(TestBase): def setUp(self): - super(OpenMLDatasetTestOnTestServer, self).setUp() + super().setUp() # longley, really small dataset self.dataset = openml.datasets.get_dataset(125, download_data=False) def test_tagging(self): - tag = "testing_tag_{}_{}".format(self.id(), time()) + # tags can be at most 64 alphanumeric (+ underscore) chars + unique_indicator = str(time()).replace(".", "") + tag = f"test_tag_OpenMLDatasetTestOnTestServer_{unique_indicator}" datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") - self.assertTrue(datasets.empty) + assert datasets.empty self.dataset.push_tag(tag) datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") - self.assertEqual(len(datasets), 1) - self.assertIn(125, datasets["did"]) + assert len(datasets) == 1 + assert 125 in datasets["did"] self.dataset.remove_tag(tag) datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") - 
self.assertTrue(datasets.empty) - - + assert datasets.empty + + def test_get_feature_with_ontology_data_id_11(self): + # test on car dataset, which has built-in ontology references + dataset = openml.datasets.get_dataset(11) + assert len(dataset.features) == 7 + assert len(dataset.features[1].ontologies) >= 2 + assert len(dataset.features[2].ontologies) >= 1 + assert len(dataset.features[3].ontologies) >= 1 + + def test_add_remove_ontology_to_dataset(self): + did = 1 + feature_index = 1 + ontology = 'https://www.openml.org/unittest/' + str(time()) + openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology) + openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology) + + def test_add_same_ontology_multiple_features(self): + did = 1 + ontology = 'https://www.openml.org/unittest/' + str(time()) + + for i in range(3): + openml.datasets.functions.data_feature_add_ontology(did, i, ontology) + + + def test_add_illegal_long_ontology(self): + did = 1 + ontology = 'http://www.google.com/' + ('a' * 257) + try: + openml.datasets.functions.data_feature_add_ontology(did, 1, ontology) + assert False + except openml.exceptions.OpenMLServerException as e: + assert e.code == 1105 + + def test_add_illegal_url_ontology(self): + did = 1 + ontology = 'not_a_url' + str(time()) + try: + openml.datasets.functions.data_feature_add_ontology(did, 1, ontology) + assert False + except openml.exceptions.OpenMLServerException as e: + assert e.code == 1106 + +@pytest.mark.production() class OpenMLDatasetTestSparse(TestBase): _multiprocess_can_split_ = True def setUp(self): - super(OpenMLDatasetTestSparse, self).setUp() + super().setUp() openml.config.server = self.production_server self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False) def test_get_sparse_dataset_array_with_target(self): X, y, _, attribute_names = self.sparse_dataset.get_data( - dataset_format="array", target="class" + dataset_format="array", + target="class", 
) - self.assertTrue(sparse.issparse(X)) - self.assertEqual(X.dtype, np.float32) - self.assertEqual(X.shape, (600, 20000)) + assert sparse.issparse(X) + assert X.dtype == np.float32 + assert X.shape == (600, 20000) - self.assertIsInstance(y, np.ndarray) - self.assertIn(y.dtype, [np.int32, np.int64]) - self.assertEqual(y.shape, (600,)) + assert isinstance(y, np.ndarray) + assert y.dtype in [np.int32, np.int64] + assert y.shape == (600,) - self.assertEqual(len(attribute_names), 20000) - self.assertNotIn("class", attribute_names) + assert len(attribute_names) == 20000 + assert "class" not in attribute_names def test_get_sparse_dataset_dataframe_with_target(self): X, y, _, attribute_names = self.sparse_dataset.get_data( - dataset_format="dataframe", target="class" + dataset_format="dataframe", + target="class", ) - self.assertIsInstance(X, pd.DataFrame) - self.assertIsInstance(X.dtypes[0], pd.SparseDtype) - self.assertEqual(X.shape, (600, 20000)) + assert isinstance(X, pd.DataFrame) + assert isinstance(X.dtypes[0], pd.SparseDtype) + assert X.shape == (600, 20000) - self.assertIsInstance(y, pd.Series) - self.assertIsInstance(y.dtypes, pd.SparseDtype) - self.assertEqual(y.shape, (600,)) + assert isinstance(y, pd.Series) + assert isinstance(y.dtypes, pd.SparseDtype) + assert y.shape == (600,) - self.assertEqual(len(attribute_names), 20000) - self.assertNotIn("class", attribute_names) + assert len(attribute_names) == 20000 + assert "class" not in attribute_names def test_get_sparse_dataset_array(self): rval, _, categorical, attribute_names = self.sparse_dataset.get_data(dataset_format="array") - self.assertTrue(sparse.issparse(rval)) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual((600, 20001), rval.shape) + assert sparse.issparse(rval) + assert rval.dtype == np.float32 + assert rval.shape == (600, 20001) - self.assertEqual(len(categorical), 20001) - self.assertTrue(all([isinstance(cat, bool) for cat in categorical])) + assert len(categorical) == 20001 + 
assert all(isinstance(cat, bool) for cat in categorical) - self.assertEqual(len(attribute_names), 20001) - self.assertTrue(all([isinstance(att, str) for att in attribute_names])) + assert len(attribute_names) == 20001 + assert all(isinstance(att, str) for att in attribute_names) def test_get_sparse_dataset_dataframe(self): rval, *_ = self.sparse_dataset.get_data() - self.assertIsInstance(rval, pd.DataFrame) + assert isinstance(rval, pd.DataFrame) np.testing.assert_array_equal( - [pd.SparseDtype(np.float32, fill_value=0.0)] * len(rval.dtypes), rval.dtypes + [pd.SparseDtype(np.float32, fill_value=0.0)] * len(rval.dtypes), + rval.dtypes, ) - self.assertEqual((600, 20001), rval.shape) + assert rval.shape == (600, 20001) def test_get_sparse_dataset_with_rowid(self): self.sparse_dataset.row_id_attribute = ["V256"] rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", include_row_id=True + dataset_format="array", + include_row_id=True, ) - self.assertTrue(sparse.issparse(rval)) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual(rval.shape, (600, 20001)) - self.assertEqual(len(categorical), 20001) + assert sparse.issparse(rval) + assert rval.dtype == np.float32 + assert rval.shape == (600, 20001) + assert len(categorical) == 20001 rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", include_row_id=False + dataset_format="array", + include_row_id=False, ) - self.assertTrue(sparse.issparse(rval)) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual(rval.shape, (600, 20000)) - self.assertEqual(len(categorical), 20000) + assert sparse.issparse(rval) + assert rval.dtype == np.float32 + assert rval.shape == (600, 20000) + assert len(categorical) == 20000 def test_get_sparse_dataset_with_ignore_attributes(self): self.sparse_dataset.ignore_attribute = ["V256"] rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", include_ignore_attribute=True + dataset_format="array", + 
include_ignore_attribute=True, ) - self.assertTrue(sparse.issparse(rval)) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual(rval.shape, (600, 20001)) + assert sparse.issparse(rval) + assert rval.dtype == np.float32 + assert rval.shape == (600, 20001) - self.assertEqual(len(categorical), 20001) + assert len(categorical) == 20001 rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", include_ignore_attribute=False + dataset_format="array", + include_ignore_attribute=False, ) - self.assertTrue(sparse.issparse(rval)) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual(rval.shape, (600, 20000)) - self.assertEqual(len(categorical), 20000) + assert sparse.issparse(rval) + assert rval.dtype == np.float32 + assert rval.shape == (600, 20000) + assert len(categorical) == 20000 def test_get_sparse_dataset_rowid_and_ignore_and_target(self): # TODO: re-add row_id and ignore attributes @@ -419,24 +485,24 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): include_row_id=False, include_ignore_attribute=False, ) - self.assertTrue(sparse.issparse(X)) - self.assertEqual(X.dtype, np.float32) - self.assertIn(y.dtype, [np.int32, np.int64]) - self.assertEqual(X.shape, (600, 19998)) + assert sparse.issparse(X) + assert X.dtype == np.float32 + assert y.dtype in [np.int32, np.int64] + assert X.shape == (600, 19998) - self.assertEqual(len(categorical), 19998) + assert len(categorical) == 19998 self.assertListEqual(categorical, [False] * 19998) - self.assertEqual(y.shape, (600,)) + assert y.shape == (600,) def test_get_sparse_categorical_data_id_395(self): dataset = openml.datasets.get_dataset(395, download_data=True) feature = dataset.features[3758] - self.assertTrue(isinstance(dataset, OpenMLDataset)) - self.assertTrue(isinstance(feature, OpenMLDataFeature)) - self.assertEqual(dataset.name, "re1.wc") - self.assertEqual(feature.name, "CLASS_LABEL") - self.assertEqual(feature.data_type, "nominal") - 
self.assertEqual(len(feature.nominal_values), 25) + assert isinstance(dataset, OpenMLDataset) + assert isinstance(feature, OpenMLDataFeature) + assert dataset.name == "re1.wc" + assert feature.name == "CLASS_LABEL" + assert feature.data_type == "nominal" + assert len(feature.nominal_values) == 25 class OpenMLDatasetFunctionTest(TestBase): @@ -445,51 +511,65 @@ class OpenMLDatasetFunctionTest(TestBase): def test__read_features(self, filename_mock, pickle_mock): """Test we read the features from the xml if no cache pickle is available. - This test also does some simple checks to verify that the features are read correctly""" + This test also does some simple checks to verify that the features are read correctly + """ filename_mock.return_value = os.path.join(self.workdir, "features.xml.pkl") pickle_mock.load.side_effect = FileNotFoundError features = openml.datasets.dataset._read_features( os.path.join( - self.static_cache_dir, "org", "openml", "test", "datasets", "2", "features.xml" - ) + self.static_cache_dir, + "org", + "openml", + "test", + "datasets", + "2", + "features.xml", + ), ) - self.assertIsInstance(features, dict) - self.assertEqual(len(features), 39) - self.assertIsInstance(features[0], OpenMLDataFeature) - self.assertEqual(features[0].name, "family") - self.assertEqual(len(features[0].nominal_values), 9) + assert isinstance(features, dict) + assert len(features) == 39 + assert isinstance(features[0], OpenMLDataFeature) + assert features[0].name == "family" + assert len(features[0].nominal_values) == 9 # pickle.load is never called because the features pickle file didn't exist - self.assertEqual(pickle_mock.load.call_count, 0) - self.assertEqual(pickle_mock.dump.call_count, 1) + assert pickle_mock.load.call_count == 0 + assert pickle_mock.dump.call_count == 1 @unittest.mock.patch("openml.datasets.dataset.pickle") @unittest.mock.patch("openml.datasets.dataset._get_qualities_pickle_file") def test__read_qualities(self, filename_mock, pickle_mock): """Test 
we read the qualities from the xml if no cache pickle is available. - This test also does some minor checks to ensure that the qualities are read correctly.""" + This test also does some minor checks to ensure that the qualities are read correctly. + """ filename_mock.return_value = os.path.join(self.workdir, "qualities.xml.pkl") pickle_mock.load.side_effect = FileNotFoundError qualities = openml.datasets.dataset._read_qualities( os.path.join( - self.static_cache_dir, "org", "openml", "test", "datasets", "2", "qualities.xml" - ) + self.static_cache_dir, + "org", + "openml", + "test", + "datasets", + "2", + "qualities.xml", + ), ) - self.assertIsInstance(qualities, dict) - self.assertEqual(len(qualities), 106) + assert isinstance(qualities, dict) + assert len(qualities) == 106 # pickle.load is never called because the qualities pickle file didn't exist - self.assertEqual(pickle_mock.load.call_count, 0) - self.assertEqual(pickle_mock.dump.call_count, 1) + assert pickle_mock.load.call_count == 0 + assert pickle_mock.dump.call_count == 1 def test__check_qualities(self): qualities = [{"oml:name": "a", "oml:value": "0.5"}] qualities = openml.datasets.dataset._check_qualities(qualities) - self.assertEqual(qualities["a"], 0.5) + assert qualities["a"] == 0.5 qualities = [{"oml:name": "a", "oml:value": "null"}] qualities = openml.datasets.dataset._check_qualities(qualities) - self.assertNotEqual(qualities["a"], qualities["a"]) + assert qualities["a"] != qualities["a"] qualities = [{"oml:name": "a", "oml:value": None}] qualities = openml.datasets.dataset._check_qualities(qualities) - self.assertNotEqual(qualities["a"], qualities["a"]) + assert qualities["a"] != qualities["a"] diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index fe04f7d96..f3d269dc1 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1,18 +1,18 @@ # License: BSD 3-Clause +from __future__ 
import annotations import os -import pathlib +from pathlib import Path import random +import shutil +import time from itertools import product from unittest import mock -import shutil import arff -import time - -import pytest import numpy as np import pandas as pd +import pytest import requests import scipy.sparse from oslo_concurrency import lockutils @@ -20,41 +20,41 @@ import openml from openml import OpenMLDataset from openml._api_calls import _download_minio_file -from openml.exceptions import ( - OpenMLHashException, - OpenMLPrivateDatasetError, - OpenMLServerException, - OpenMLNotAuthorizedError, -) -from openml.testing import TestBase, create_request_response -from openml.utils import _tag_entity, _create_cache_directory_for_id +from openml.datasets import edit_dataset, fork_dataset from openml.datasets.functions import ( - create_dataset, - attributes_arff_from_df, + DATASETS_CACHE_DIR_NAME, _get_dataset_arff, _get_dataset_description, _get_dataset_features_file, + _get_dataset_parquet, _get_dataset_qualities_file, _get_online_dataset_arff, _get_online_dataset_format, - DATASETS_CACHE_DIR_NAME, - _get_dataset_parquet, _topic_add_dataset, _topic_delete_dataset, + attributes_arff_from_df, + create_dataset, +) +from openml.exceptions import ( + OpenMLHashException, + OpenMLNotAuthorizedError, + OpenMLPrivateDatasetError, + OpenMLServerException, ) -from openml.datasets import fork_dataset, edit_dataset from openml.tasks import TaskType, create_task +from openml.testing import TestBase, create_request_response +from openml.utils import _create_cache_directory_for_id, _tag_entity class TestOpenMLDataset(TestBase): _multiprocess_can_split_ = True def setUp(self): - super(TestOpenMLDataset, self).setUp() + super().setUp() def tearDown(self): self._remove_pickle_files() - super(TestOpenMLDataset, self).tearDown() + super().tearDown() def _remove_pickle_files(self): self.lock_path = os.path.join(openml.config.get_cache_directory(), "locks") @@ -64,7 +64,10 @@ def 
_remove_pickle_files(self): lock_path=self.lock_path, ): pickle_path = os.path.join( - openml.config.get_cache_directory(), "datasets", did, "dataset.pkl.py3" + openml.config.get_cache_directory(), + "datasets", + did, + "dataset.pkl.py3", ) try: os.remove(pickle_path) @@ -90,13 +93,13 @@ def _get_empty_param_for_dataset(self): } def _check_dataset(self, dataset): - self.assertEqual(type(dataset), dict) - self.assertGreaterEqual(len(dataset), 2) - self.assertIn("did", dataset) - self.assertIsInstance(dataset["did"], int) - self.assertIn("status", dataset) - self.assertIsInstance(dataset["status"], str) - self.assertIn(dataset["status"], ["in_preparation", "active", "deactivated"]) + assert type(dataset) == dict + assert len(dataset) >= 2 + assert "did" in dataset + assert isinstance(dataset["did"], int) + assert "status" in dataset + assert isinstance(dataset["status"], str) + assert dataset["status"] in ["in_preparation", "active", "deactivated"] def _check_datasets(self, datasets): for did in datasets: @@ -105,29 +108,31 @@ def _check_datasets(self, datasets): def test_tag_untag_dataset(self): tag = "test_tag_%d" % random.randint(1, 1000000) all_tags = _tag_entity("data", 1, tag) - self.assertTrue(tag in all_tags) + assert tag in all_tags all_tags = _tag_entity("data", 1, tag, untag=True) - self.assertTrue(tag not in all_tags) + assert tag not in all_tags def test_list_datasets_output_format(self): datasets = openml.datasets.list_datasets(output_format="dataframe") - self.assertIsInstance(datasets, pd.DataFrame) - self.assertGreaterEqual(len(datasets), 100) + assert isinstance(datasets, pd.DataFrame) + assert len(datasets) >= 100 def test_list_datasets_paginate(self): size = 10 max = 100 for i in range(0, max, size): datasets = openml.datasets.list_datasets(offset=i, size=size) - self.assertEqual(size, len(datasets)) + assert size == len(datasets) self._check_datasets(datasets) def test_list_datasets_empty(self): datasets = openml.datasets.list_datasets( - 
tag="NoOneWouldUseThisTagAnyway", output_format="dataframe" + tag="NoOneWouldUseThisTagAnyway", + output_format="dataframe", ) - self.assertTrue(datasets.empty) + assert datasets.empty + @pytest.mark.production() def test_check_datasets_active(self): # Have to test on live because there is no deactivated dataset on the test server. openml.config.server = self.production_server @@ -135,9 +140,9 @@ def test_check_datasets_active(self): [2, 17, 79], raise_error_if_not_exist=False, ) - self.assertTrue(active[2]) - self.assertFalse(active[17]) - self.assertIsNone(active.get(79)) + assert active[2] + assert not active[17] + assert active.get(79) is None self.assertRaisesRegex( ValueError, r"Could not find dataset\(s\) 79 in OpenML dataset list.", @@ -146,6 +151,24 @@ def test_check_datasets_active(self): ) openml.config.server = self.test_server + def test_illegal_character_tag(self): + dataset = openml.datasets.get_dataset(1) + tag = "illegal_tag&" + try: + dataset.push_tag(tag) + raise AssertionError() + except openml.exceptions.OpenMLServerException as e: + assert e.code == 477 + + def test_illegal_length_tag(self): + dataset = openml.datasets.get_dataset(1) + tag = "a" * 65 + try: + dataset.push_tag(tag) + raise AssertionError() + except openml.exceptions.OpenMLServerException as e: + assert e.code == 477 + def _datasets_retrieved_successfully(self, dids, metadata_only=True): """Checks that all files for the given dids have been downloaded. @@ -156,25 +179,19 @@ def _datasets_retrieved_successfully(self, dids, metadata_only=True): - absence of data arff if metadata_only, else it must be present too. 
""" for did in dids: - self.assertTrue( - os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "description.xml" - ) + assert os.path.exists( + os.path.join( + openml.config.get_cache_directory(), "datasets", str(did), "description.xml" ) ) - self.assertTrue( - os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "qualities.xml" - ) + assert os.path.exists( + os.path.join( + openml.config.get_cache_directory(), "datasets", str(did), "qualities.xml" ) ) - self.assertTrue( - os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "features.xml" - ) + assert os.path.exists( + os.path.join( + openml.config.get_cache_directory(), "datasets", str(did), "features.xml" ) ) @@ -182,28 +199,35 @@ def _datasets_retrieved_successfully(self, dids, metadata_only=True): data_assert( os.path.exists( os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "dataset.arff" - ) - ) + openml.config.get_cache_directory(), + "datasets", + str(did), + "dataset.arff", + ), + ), ) + @pytest.mark.production() def test__name_to_id_with_deactivated(self): """Check that an activated dataset is returned if an earlier deactivated one exists.""" openml.config.server = self.production_server # /d/1 was deactivated - self.assertEqual(openml.datasets.functions._name_to_id("anneal"), 2) + assert openml.datasets.functions._name_to_id("anneal") == 2 openml.config.server = self.test_server + @pytest.mark.production() def test__name_to_id_with_multiple_active(self): """With multiple active datasets, retrieve the least recent active.""" openml.config.server = self.production_server - self.assertEqual(openml.datasets.functions._name_to_id("iris"), 61) + assert openml.datasets.functions._name_to_id("iris") == 61 + @pytest.mark.production() def test__name_to_id_with_version(self): """With multiple active datasets, retrieve the least recent active.""" openml.config.server = 
self.production_server - self.assertEqual(openml.datasets.functions._name_to_id("iris", version=3), 969) + assert openml.datasets.functions._name_to_id("iris", version=3) == 969 + @pytest.mark.production() def test__name_to_id_with_multiple_active_error(self): """With multiple active datasets, retrieve the least recent active.""" openml.config.server = self.production_server @@ -238,40 +262,41 @@ def test_get_datasets_by_name(self): # did 1 and 2 on the test server: dids = ["anneal", "kr-vs-kp"] datasets = openml.datasets.get_datasets(dids, download_data=False) - self.assertEqual(len(datasets), 2) + assert len(datasets) == 2 self._datasets_retrieved_successfully([1, 2]) def test_get_datasets_by_mixed(self): # did 1 and 2 on the test server: dids = ["anneal", 2] datasets = openml.datasets.get_datasets(dids, download_data=False) - self.assertEqual(len(datasets), 2) + assert len(datasets) == 2 self._datasets_retrieved_successfully([1, 2]) def test_get_datasets(self): dids = [1, 2] datasets = openml.datasets.get_datasets(dids) - self.assertEqual(len(datasets), 2) + assert len(datasets) == 2 self._datasets_retrieved_successfully([1, 2], metadata_only=False) def test_get_datasets_lazy(self): dids = [1, 2] datasets = openml.datasets.get_datasets(dids, download_data=False) - self.assertEqual(len(datasets), 2) + assert len(datasets) == 2 self._datasets_retrieved_successfully([1, 2], metadata_only=True) datasets[0].get_data() datasets[1].get_data() self._datasets_retrieved_successfully([1, 2], metadata_only=False) + @pytest.mark.production() def test_get_dataset_by_name(self): dataset = openml.datasets.get_dataset("anneal") - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.dataset_id, 1) + assert type(dataset) == OpenMLDataset + assert dataset.dataset_id == 1 self._datasets_retrieved_successfully([1], metadata_only=False) - self.assertGreater(len(dataset.features), 1) - self.assertGreater(len(dataset.qualities), 4) + assert len(dataset.features) > 1 
+ assert len(dataset.qualities) > 4 # Issue324 Properly handle private datasets when trying to access them openml.config.server = self.production_server @@ -288,33 +313,35 @@ def test_get_dataset_download_all_files(self): def test_get_dataset_uint8_dtype(self): dataset = openml.datasets.get_dataset(1) - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.name, "anneal") + assert type(dataset) == OpenMLDataset + assert dataset.name == "anneal" df, _, _, _ = dataset.get_data() - self.assertEqual(df["carbon"].dtype, "uint8") + assert df["carbon"].dtype == "uint8" + @pytest.mark.production() def test_get_dataset(self): # This is the only non-lazy load to ensure default behaviour works. dataset = openml.datasets.get_dataset(1) - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.name, "anneal") + assert type(dataset) == OpenMLDataset + assert dataset.name == "anneal" self._datasets_retrieved_successfully([1], metadata_only=False) - self.assertGreater(len(dataset.features), 1) - self.assertGreater(len(dataset.qualities), 4) + assert len(dataset.features) > 1 + assert len(dataset.qualities) > 4 # Issue324 Properly handle private datasets when trying to access them openml.config.server = self.production_server self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45) + @pytest.mark.production() def test_get_dataset_lazy(self): dataset = openml.datasets.get_dataset(1, download_data=False) - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.name, "anneal") + assert type(dataset) == OpenMLDataset + assert dataset.name == "anneal" self._datasets_retrieved_successfully([1], metadata_only=True) - self.assertGreater(len(dataset.features), 1) - self.assertGreater(len(dataset.qualities), 4) + assert len(dataset.features) > 1 + assert len(dataset.qualities) > 4 dataset.get_data() self._datasets_retrieved_successfully([1], metadata_only=False) @@ -329,12 +356,8 @@ def 
test_get_dataset_lazy_all_functions(self): # We only tests functions as general integrity is tested by test_get_dataset_lazy def ensure_absence_of_real_data(): - self.assertFalse( - os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", "1", "dataset.arff" - ) - ) + assert not os.path.exists( + os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff") ) tag = "test_lazy_tag_%d" % random.randint(1, 1000000) @@ -349,36 +372,36 @@ def ensure_absence_of_real_data(): correct = [0, 1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 35, 36, 37, 38] # fmt: on - self.assertEqual(nominal_indices, correct) + assert nominal_indices == correct ensure_absence_of_real_data() classes = dataset.retrieve_class_labels() - self.assertEqual(classes, ["1", "2", "3", "4", "5", "U"]) + assert classes == ["1", "2", "3", "4", "5", "U"] ensure_absence_of_real_data() def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102, download_data=False) X, *_ = dataset.get_data(dataset_format="array") - self.assertIsInstance(X, scipy.sparse.csr_matrix) + assert isinstance(X, scipy.sparse.csr_matrix) def test_download_rowid(self): # Smoke test which checks that the dataset has the row-id set correctly did = 44 dataset = openml.datasets.get_dataset(did, download_data=False) - self.assertEqual(dataset.row_id_attribute, "Counter") + assert dataset.row_id_attribute == "Counter" def test__get_dataset_description(self): description = _get_dataset_description(self.workdir, 2) - self.assertIsInstance(description, dict) + assert isinstance(description, dict) description_xml_path = os.path.join(self.workdir, "description.xml") - self.assertTrue(os.path.exists(description_xml_path)) + assert os.path.exists(description_xml_path) def test__getarff_path_dataset_arff(self): openml.config.set_root_cache_directory(self.static_cache_dir) description = _get_dataset_description(self.workdir, 2) 
arff_path = _get_dataset_arff(description, cache_directory=self.workdir) - self.assertIsInstance(arff_path, str) - self.assertTrue(os.path.exists(arff_path)) + assert isinstance(arff_path, Path) + assert arff_path.exists() def test__download_minio_file_object_does_not_exist(self): self.assertRaisesRegex( @@ -396,10 +419,9 @@ def test__download_minio_file_to_directory(self): destination=self.workdir, exists_ok=True, ) - self.assertTrue( - os.path.isfile(os.path.join(self.workdir, "dataset_20.pq")), - "_download_minio_file can save to a folder by copying the object name", - ) + assert os.path.isfile( + os.path.join(self.workdir, "dataset_20.pq") + ), "_download_minio_file can save to a folder by copying the object name" def test__download_minio_file_to_path(self): file_destination = os.path.join(self.workdir, "custom.pq") @@ -408,13 +430,12 @@ def test__download_minio_file_to_path(self): destination=file_destination, exists_ok=True, ) - self.assertTrue( - os.path.isfile(file_destination), - "_download_minio_file can save to a folder by copying the object name", - ) + assert os.path.isfile( + file_destination + ), "_download_minio_file can save to a folder by copying the object name" def test__download_minio_file_raises_FileExists_if_destination_in_use(self): - file_destination = pathlib.Path(self.workdir, "custom.pq") + file_destination = Path(self.workdir, "custom.pq") file_destination.touch() self.assertRaises( @@ -426,47 +447,46 @@ def test__download_minio_file_raises_FileExists_if_destination_in_use(self): ) def test__download_minio_file_works_with_bucket_subdirectory(self): - file_destination = pathlib.Path(self.workdir, "custom.pq") + file_destination = Path(self.workdir, "custom.pq") _download_minio_file( source="http://openml1.win.tue.nl/dataset61/dataset_61.pq", destination=file_destination, exists_ok=True, ) - self.assertTrue( - os.path.isfile(file_destination), - "_download_minio_file can download from subdirectories", - ) + assert os.path.isfile( + 
file_destination + ), "_download_minio_file can download from subdirectories" def test__get_dataset_parquet_not_cached(self): description = { - "oml:minio_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq", + "oml:parquet_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq", "oml:id": "20", } path = _get_dataset_parquet(description, cache_directory=self.workdir) - self.assertIsInstance(path, str, "_get_dataset_parquet returns a path") - self.assertTrue(os.path.isfile(path), "_get_dataset_parquet returns path to real file") + assert isinstance(path, Path), "_get_dataset_parquet returns a path" + assert path.is_file(), "_get_dataset_parquet returns path to real file" @mock.patch("openml._api_calls._download_minio_file") def test__get_dataset_parquet_is_cached(self, patch): openml.config.set_root_cache_directory(self.static_cache_dir) patch.side_effect = RuntimeError( - "_download_minio_file should not be called when loading from cache" + "_download_parquet_url should not be called when loading from cache", ) description = { - "oml:minio_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq", + "oml:parquet_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq", "oml:id": "30", } path = _get_dataset_parquet(description, cache_directory=None) - self.assertIsInstance(path, str, "_get_dataset_parquet returns a path") - self.assertTrue(os.path.isfile(path), "_get_dataset_parquet returns path to real file") + assert isinstance(path, Path), "_get_dataset_parquet returns a path" + assert path.is_file(), "_get_dataset_parquet returns path to real file" def test__get_dataset_parquet_file_does_not_exist(self): description = { - "oml:minio_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq", + "oml:parquet_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq", "oml:id": "20", } path = _get_dataset_parquet(description, cache_directory=self.workdir) - self.assertIsNone(path, "_get_dataset_parquet returns None if no file is found") + assert path 
is None, "_get_dataset_parquet returns None if no file is found" def test__getarff_md5_issue(self): description = { @@ -489,26 +509,28 @@ def test__getarff_md5_issue(self): def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) - self.assertIsInstance(features_file, str) - features_xml_path = os.path.join(self.workdir, "features.xml") - self.assertTrue(os.path.exists(features_xml_path)) + assert isinstance(features_file, Path) + features_xml_path = self.workdir / "features.xml" + assert features_xml_path.exists() def test__get_dataset_qualities(self): qualities = _get_dataset_qualities_file(self.workdir, 2) - self.assertIsInstance(qualities, str) - qualities_xml_path = os.path.join(self.workdir, "qualities.xml") - self.assertTrue(os.path.exists(qualities_xml_path)) + assert isinstance(qualities, Path) + qualities_xml_path = self.workdir / "qualities.xml" + assert qualities_xml_path.exists() def test__get_dataset_skip_download(self): dataset = openml.datasets.get_dataset( - 2, download_qualities=False, download_features_meta_data=False + 2, + download_qualities=False, + download_features_meta_data=False, ) # Internal representation without lazy loading - self.assertIsNone(dataset._qualities) - self.assertIsNone(dataset._features) + assert dataset._qualities is None + assert dataset._features is None # External representation with lazy loading - self.assertIsNotNone(dataset.qualities) - self.assertIsNotNone(dataset.features) + assert dataset.qualities is not None + assert dataset.features is not None def test_get_dataset_force_refresh_cache(self): did_cache_dir = _create_cache_directory_for_id( @@ -520,11 +542,11 @@ def test_get_dataset_force_refresh_cache(self): # Test default openml.datasets.get_dataset(2) - self.assertEqual(change_time, os.stat(did_cache_dir).st_mtime) + assert change_time == os.stat(did_cache_dir).st_mtime # Test refresh openml.datasets.get_dataset(2, force_refresh_cache=True) - 
self.assertNotEqual(change_time, os.stat(did_cache_dir).st_mtime) + assert change_time != os.stat(did_cache_dir).st_mtime # Final clean up openml.utils._remove_cache_dir_for_id( @@ -545,7 +567,7 @@ def test_get_dataset_force_refresh_cache_clean_start(self): # Test clean start openml.datasets.get_dataset(2, force_refresh_cache=True) - self.assertTrue(os.path.exists(did_cache_dir)) + assert os.path.exists(did_cache_dir) # Final clean up openml.utils._remove_cache_dir_for_id( @@ -559,12 +581,12 @@ def test_deletion_of_cache_dir(self): DATASETS_CACHE_DIR_NAME, 1, ) - self.assertTrue(os.path.exists(did_cache_dir)) + assert os.path.exists(did_cache_dir) openml.utils._remove_cache_dir_for_id( DATASETS_CACHE_DIR_NAME, did_cache_dir, ) - self.assertFalse(os.path.exists(did_cache_dir)) + assert not os.path.exists(did_cache_dir) # Use _get_dataset_arff to load the description, trigger an exception in the # test target and have a slightly higher coverage @@ -573,13 +595,16 @@ def test_deletion_of_cache_dir_faulty_download(self, patch): patch.side_effect = Exception("Boom!") self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1) datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets") - self.assertEqual(len(os.listdir(datasets_cache_dir)), 0) + assert len(os.listdir(datasets_cache_dir)) == 0 def test_publish_dataset(self): # lazy loading not possible as we need the arff-file. 
openml.datasets.get_dataset(3) file_path = os.path.join( - openml.config.get_cache_directory(), "datasets", "3", "dataset.arff" + openml.config.get_cache_directory(), + "datasets", + "3", + "dataset.arff", ) dataset = OpenMLDataset( "anneal", @@ -593,18 +618,25 @@ def test_publish_dataset(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.dataset_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.dataset_id) + "collected from {}: {}".format(__file__.split("/")[-1], dataset.dataset_id), ) - self.assertIsInstance(dataset.dataset_id, int) + assert isinstance(dataset.dataset_id, int) def test__retrieve_class_labels(self): openml.config.set_root_cache_directory(self.static_cache_dir) labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels() - self.assertEqual(labels, ["1", "2", "3", "4", "5", "U"]) + assert labels == ["1", "2", "3", "4", "5", "U"] + labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels( - target_name="product-type" + target_name="product-type", ) - self.assertEqual(labels, ["C", "H", "G"]) + assert labels == ["C", "H", "G"] + + # Test workaround for string-typed class labels + custom_ds = openml.datasets.get_dataset(2, download_data=False) + custom_ds.features[31].data_type = "string" + labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name) + assert labels == ["COIL", "SHEET"] def test_upload_dataset_with_url(self): dataset = OpenMLDataset( @@ -617,21 +649,23 @@ def test_upload_dataset_with_url(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.dataset_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.dataset_id) + "collected from {}: {}".format(__file__.split("/")[-1], dataset.dataset_id), ) - self.assertIsInstance(dataset.dataset_id, int) + assert isinstance(dataset.dataset_id, int) def _assert_status_of_dataset(self, *, did: int, status: 
str): """Asserts there is exactly one dataset with id `did` and its current status is `status`""" # need to use listing fn, as this is immune to cache result = openml.datasets.list_datasets( - data_id=[did], status="all", output_format="dataframe" + data_id=[did], + status="all", + output_format="dataframe", ) result = result.to_dict(orient="index") # I think we should drop the test that one result is returned, # the server should never return multiple results? - self.assertEqual(len(result), 1) - self.assertEqual(result[did]["status"], status) + assert len(result) == 1 + assert result[did]["status"] == status @pytest.mark.flaky() def test_data_status(self): @@ -660,7 +694,7 @@ def test_data_status(self): openml.datasets.status_update(did, "active") self._assert_status_of_dataset(did=did, status="active") - with self.assertRaises(ValueError): + with pytest.raises(ValueError): openml.datasets.status_update(did, "in_preparation") self._assert_status_of_dataset(did=did, status="active") @@ -672,32 +706,29 @@ def test_attributes_arff_from_df(self): ) df["category"] = df["category"].astype("category") attributes = attributes_arff_from_df(df) - self.assertEqual( - attributes, - [ - ("integer", "INTEGER"), - ("floating", "REAL"), - ("string", "STRING"), - ("category", ["A", "B"]), - ("boolean", ["True", "False"]), - ], - ) + assert attributes == [ + ("integer", "INTEGER"), + ("floating", "REAL"), + ("string", "STRING"), + ("category", ["A", "B"]), + ("boolean", ["True", "False"]), + ] # DataFrame with Sparse columns case df = pd.DataFrame( { "integer": pd.arrays.SparseArray([1, 2, 0], fill_value=0), "floating": pd.arrays.SparseArray([1.0, 2.0, 0], fill_value=0.0), - } + }, ) df["integer"] = df["integer"].astype(np.int64) attributes = attributes_arff_from_df(df) - self.assertEqual(attributes, [("integer", "INTEGER"), ("floating", "REAL")]) + assert attributes == [("integer", "INTEGER"), ("floating", "REAL")] def test_attributes_arff_from_df_numeric_column(self): # Test 
column names are automatically converted to str if needed (#819) df = pd.DataFrame({0: [1, 2, 3], 0.5: [4, 5, 6], "target": [0, 1, 1]}) attributes = attributes_arff_from_df(df) - self.assertEqual(attributes, [("0", "INTEGER"), ("0.5", "INTEGER"), ("target", "INTEGER")]) + assert attributes == [("0", "INTEGER"), ("0.5", "INTEGER"), ("target", "INTEGER")] def test_attributes_arff_from_df_mixed_dtype_categories(self): # liac-arff imposed categorical attributes to be of sting dtype. We @@ -719,8 +750,7 @@ def test_attributes_arff_from_df_unknown_dtype(self): for arr, dt in zip(data, dtype): df = pd.DataFrame(arr) err_msg = ( - "The dtype '{}' of the column '0' is not currently " - "supported by liac-arff".format(dt) + f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff" ) with pytest.raises(ValueError, match=err_msg): attributes_arff_from_df(df) @@ -728,7 +758,7 @@ def test_attributes_arff_from_df_unknown_dtype(self): def test_create_dataset_numpy(self): data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T - attributes = [("col_{}".format(i), "REAL") for i in range(data.shape[1])] + attributes = [(f"col_{i}", "REAL") for i in range(data.shape[1])] dataset = create_dataset( name="%s-NumPy_testing_dataset" % self._get_sentinel(), @@ -738,7 +768,7 @@ def test_create_dataset_numpy(self): collection_date="01-01-2018", language="English", licence="MIT", - default_target_attribute="col_{}".format(data.shape[1] - 1), + default_target_attribute=f"col_{data.shape[1] - 1}", row_id_attribute=None, ignore_attribute=None, citation="None", @@ -753,12 +783,10 @@ def test_create_dataset_numpy(self): TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) - self.assertEqual( - _get_online_dataset_arff(dataset.id), - dataset._dataset, - "Uploaded arff does not match original one", - ) - self.assertEqual(_get_online_dataset_format(dataset.id), "arff", "Wrong 
format for dataset") + assert ( + _get_online_dataset_arff(dataset.id) == dataset._dataset + ), "Uploaded arff does not match original one" + assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" def test_create_dataset_list(self): data = [ @@ -809,17 +837,15 @@ def test_create_dataset_list(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) - self.assertEqual( - _get_online_dataset_arff(dataset.id), - dataset._dataset, - "Uploaded ARFF does not match original one", - ) - self.assertEqual(_get_online_dataset_format(dataset.id), "arff", "Wrong format for dataset") + assert ( + _get_online_dataset_arff(dataset.id) == dataset._dataset + ), "Uploaded ARFF does not match original one" + assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix sparse_data = scipy.sparse.coo_matrix( - ([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])) + ([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])), ) column_names = [ @@ -848,16 +874,14 @@ def test_create_dataset_sparse(self): xor_dataset.publish() TestBase._mark_entity_for_removal("data", xor_dataset.id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], xor_dataset.id) - ) - self.assertEqual( - _get_online_dataset_arff(xor_dataset.id), - xor_dataset._dataset, - "Uploaded ARFF does not match original one", - ) - self.assertEqual( - _get_online_dataset_format(xor_dataset.id), "sparse_arff", "Wrong format for dataset" + "collected from {}: {}".format(__file__.split("/")[-1], xor_dataset.id), ) + assert ( + _get_online_dataset_arff(xor_dataset.id) == xor_dataset._dataset + ), "Uploaded ARFF does not match original one" + assert ( + _get_online_dataset_format(xor_dataset.id) == "sparse_arff" + ), 
"Wrong format for dataset" # test the list of dicts sparse representation sparse_data = [{0: 0.0}, {1: 1.0, 2: 1.0}, {0: 1.0, 2: 1.0}, {0: 1.0, 1: 1.0}] @@ -882,16 +906,14 @@ def test_create_dataset_sparse(self): xor_dataset.publish() TestBase._mark_entity_for_removal("data", xor_dataset.id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], xor_dataset.id) - ) - self.assertEqual( - _get_online_dataset_arff(xor_dataset.id), - xor_dataset._dataset, - "Uploaded ARFF does not match original one", - ) - self.assertEqual( - _get_online_dataset_format(xor_dataset.id), "sparse_arff", "Wrong format for dataset" + "collected from {}: {}".format(__file__.split("/")[-1], xor_dataset.id), ) + assert ( + _get_online_dataset_arff(xor_dataset.id) == xor_dataset._dataset + ), "Uploaded ARFF does not match original one" + assert ( + _get_online_dataset_format(xor_dataset.id) == "sparse_arff" + ), "Wrong format for dataset" def test_create_invalid_dataset(self): data = [ @@ -928,15 +950,11 @@ def test_get_online_dataset_arff(self): # the same as the arff from _get_arff function d_format = (dataset.format).lower() - self.assertEqual( - dataset._get_arff(d_format), - decoder.decode( - _get_online_dataset_arff(dataset_id), - encode_nominal=True, - return_type=arff.DENSE if d_format == "arff" else arff.COO, - ), - "ARFF files are not equal", - ) + assert dataset._get_arff(d_format) == decoder.decode( + _get_online_dataset_arff(dataset_id), + encode_nominal=True, + return_type=arff.DENSE if d_format == "arff" else arff.COO, + ), "ARFF files are not equal" def test_topic_api_error(self): # Check server exception when non-admin accessses apis @@ -961,11 +979,9 @@ def test_get_online_dataset_format(self): dataset_id = 77 dataset = openml.datasets.get_dataset(dataset_id, download_data=False) - self.assertEqual( - (dataset.format).lower(), - _get_online_dataset_format(dataset_id), - "The format of the ARFF files is different", - ) + assert dataset.format.lower() 
== _get_online_dataset_format( + dataset_id + ), "The format of the ARFF files is different" def test_create_dataset_pandas(self): data = [ @@ -1012,15 +1028,13 @@ def test_create_dataset_pandas(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) - self.assertEqual( - _get_online_dataset_arff(dataset.id), - dataset._dataset, - "Uploaded ARFF does not match original one", - ) + assert ( + _get_online_dataset_arff(dataset.id) == dataset._dataset + ), "Uploaded ARFF does not match original one" # Check that DataFrame with Sparse columns are supported properly sparse_data = scipy.sparse.coo_matrix( - ([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])) + ([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])), ) column_names = ["input1", "input2", "y"] df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names) @@ -1047,14 +1061,10 @@ def test_create_dataset_pandas(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) - self.assertEqual( - _get_online_dataset_arff(dataset.id), - dataset._dataset, - "Uploaded ARFF does not match original one", - ) - self.assertEqual( - _get_online_dataset_format(dataset.id), "sparse_arff", "Wrong format for dataset" - ) + assert ( + _get_online_dataset_arff(dataset.id) == dataset._dataset + ), "Uploaded ARFF does not match original one" + assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset" # Check that we can overwrite the attributes data = [["a"], ["b"], ["c"], ["d"], ["e"]] @@ -1084,10 +1094,8 @@ def test_create_dataset_pandas(self): TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) downloaded_data = 
_get_online_dataset_arff(dataset.id) - self.assertEqual( - downloaded_data, dataset._dataset, "Uploaded ARFF does not match original one" - ) - self.assertTrue("@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data) + assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one" + assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data def test_ignore_attributes_dataset(self): data = [ @@ -1136,7 +1144,7 @@ def test_ignore_attributes_dataset(self): original_data_url=original_data_url, paper_url=paper_url, ) - self.assertEqual(dataset.ignore_attribute, ["outlook"]) + assert dataset.ignore_attribute == ["outlook"] # pass a list to ignore_attribute ignore_attribute = ["outlook", "windy"] @@ -1158,7 +1166,7 @@ def test_ignore_attributes_dataset(self): original_data_url=original_data_url, paper_url=paper_url, ) - self.assertEqual(dataset.ignore_attribute, ignore_attribute) + assert dataset.ignore_attribute == ignore_attribute # raise an error if unknown type err_msg = "Wrong data type for ignore_attribute. Should be list." 
@@ -1173,7 +1181,7 @@ def test_ignore_attributes_dataset(self): licence=licence, default_target_attribute=default_target_attribute, row_id_attribute=None, - ignore_attribute=tuple(["outlook", "windy"]), + ignore_attribute=("outlook", "windy"), citation=citation, attributes="auto", data=df, @@ -1235,10 +1243,10 @@ def test_publish_fetch_ignore_attribute(self): TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) # test if publish was successful - self.assertIsInstance(dataset.id, int) + assert isinstance(dataset.id, int) downloaded_dataset = self._wait_for_dataset_being_processed(dataset.id) - self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute) + assert downloaded_dataset.ignore_attribute == ignore_attribute def _wait_for_dataset_being_processed(self, dataset_id): downloaded_dataset = None @@ -1255,12 +1263,12 @@ def _wait_for_dataset_being_processed(self, dataset_id): # returned code 273: Dataset not processed yet # returned code 362: No qualities found TestBase.logger.error( - "Failed to fetch dataset:{} with '{}'.".format(dataset_id, str(e)) + f"Failed to fetch dataset:{dataset_id} with '{e!s}'.", ) time.sleep(10) continue if downloaded_dataset is None: - raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset_id)) + raise ValueError(f"TIMEOUT: Failed to fetch uploaded dataset - {dataset_id}") return downloaded_dataset def test_create_dataset_row_id_attribute_error(self): @@ -1321,7 +1329,8 @@ def test_create_dataset_row_id_attribute_inference(self): df_index_name = [None, "index_name"] expected_row_id = [None, "index_name", "integer", "integer"] for output_row_id, (row_id, index_name) in zip( - expected_row_id, product(row_id_attr, df_index_name) + expected_row_id, + product(row_id_attr, df_index_name), ): df.index.name = index_name dataset = openml.datasets.functions.create_dataset( @@ -1342,18 +1351,18 @@ def 
test_create_dataset_row_id_attribute_inference(self): original_data_url=original_data_url, paper_url=paper_url, ) - self.assertEqual(dataset.row_id_attribute, output_row_id) + assert dataset.row_id_attribute == output_row_id dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) + "collected from {}: {}".format(__file__.split("/")[-1], dataset.id), ) arff_dataset = arff.loads(_get_online_dataset_arff(dataset.id)) arff_data = np.array(arff_dataset["data"], dtype=object) # if we set the name of the index then the index will be added to # the data expected_shape = (5, 3) if index_name is None else (5, 4) - self.assertEqual(arff_data.shape, expected_shape) + assert arff_data.shape == expected_shape def test_create_dataset_attributes_auto_without_df(self): # attributes cannot be inferred without passing a dataframe @@ -1365,7 +1374,7 @@ def test_create_dataset_attributes_auto_without_df(self): collection_date = "01-01-2018" language = "English" licence = "MIT" - default_target_attribute = "col_{}".format(data.shape[1] - 1) + default_target_attribute = f"col_{data.shape[1] - 1}" citation = "None" original_data_url = "http://openml.github.io/openml-python" paper_url = "http://openml.github.io/openml-python" @@ -1392,23 +1401,23 @@ def test_create_dataset_attributes_auto_without_df(self): def test_list_qualities(self): qualities = openml.datasets.list_qualities() - self.assertEqual(isinstance(qualities, list), True) - self.assertEqual(all([isinstance(q, str) for q in qualities]), True) + assert isinstance(qualities, list) is True + assert all(isinstance(q, str) for q in qualities) is True def test_get_dataset_cache_format_pickle(self): dataset = openml.datasets.get_dataset(1) dataset.get_data() - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.name, "anneal") - self.assertGreater(len(dataset.features), 1) - 
self.assertGreater(len(dataset.qualities), 4) + assert type(dataset) == OpenMLDataset + assert dataset.name == "anneal" + assert len(dataset.features) > 1 + assert len(dataset.qualities) > 4 X, y, categorical, attribute_names = dataset.get_data() - self.assertIsInstance(X, pd.DataFrame) - self.assertEqual(X.shape, (898, 39)) - self.assertEqual(len(categorical), X.shape[1]) - self.assertEqual(len(attribute_names), X.shape[1]) + assert isinstance(X, pd.DataFrame) + assert X.shape == (898, 39) + assert len(categorical) == X.shape[1] + assert len(attribute_names) == X.shape[1] def test_get_dataset_cache_format_feather(self): # This test crashed due to using the parquet file by default, which is downloaded @@ -1416,7 +1425,7 @@ def test_get_dataset_cache_format_feather(self): # The parquet file on minio with ID 128 is not the iris dataset from the test server. dataset = openml.datasets.get_dataset(128, cache_format="feather") # Workaround - dataset._minio_url = None + dataset._parquet_url = None dataset.parquet_file = None dataset.get_data() @@ -1426,21 +1435,21 @@ def test_get_dataset_cache_format_feather(self): feather_file = os.path.join(cache_dir_for_id, "dataset.feather") pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3") data = pd.read_feather(feather_file) - self.assertTrue(os.path.isfile(feather_file), msg="Feather file is missing") - self.assertTrue(os.path.isfile(pickle_file), msg="Attributes pickle file is missing") - self.assertEqual(data.shape, (150, 5)) + assert os.path.isfile(feather_file), "Feather file is missing" + assert os.path.isfile(pickle_file), "Attributes pickle file is missing" + assert data.shape == (150, 5) # Check if get_data is able to retrieve feather data - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.name, "iris") - self.assertGreater(len(dataset.features), 1) - self.assertGreater(len(dataset.qualities), 4) + assert type(dataset) == OpenMLDataset + assert dataset.name == "iris" 
+ assert len(dataset.features) > 1 + assert len(dataset.qualities) > 4 X, y, categorical, attribute_names = dataset.get_data() - self.assertIsInstance(X, pd.DataFrame) - self.assertEqual(X.shape, (150, 5)) - self.assertEqual(len(categorical), X.shape[1]) - self.assertEqual(len(attribute_names), X.shape[1]) + assert isinstance(X, pd.DataFrame) + assert X.shape == (150, 5) + assert len(categorical) == X.shape[1] + assert len(attribute_names) == X.shape[1] def test_data_edit_non_critical_field(self): # Case 1 @@ -1459,9 +1468,9 @@ def test_data_edit_non_critical_field(self): citation="The use of multiple measurements in taxonomic problems", language="English", ) - self.assertEqual(did, result) + assert did == result edited_dataset = openml.datasets.get_dataset(did) - self.assertEqual(edited_dataset.description, desc) + assert edited_dataset.description == desc def test_data_edit_critical_field(self): # Case 2 @@ -1470,15 +1479,15 @@ def test_data_edit_critical_field(self): did = fork_dataset(1) self._wait_for_dataset_being_processed(did) result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil") - self.assertEqual(did, result) + assert did == result n_tries = 10 # we need to wait for the edit to be reflected on the server for i in range(n_tries): edited_dataset = openml.datasets.get_dataset(did) try: - self.assertEqual(edited_dataset.default_target_attribute, "shape", edited_dataset) - self.assertEqual(edited_dataset.ignore_attribute, ["oil"], edited_dataset) + assert edited_dataset.default_target_attribute == "shape", edited_dataset + assert edited_dataset.ignore_attribute == ["oil"], edited_dataset break except AssertionError as e: if i == n_tries - 1: @@ -1486,7 +1495,7 @@ def test_data_edit_critical_field(self): time.sleep(10) # Delete the cache dir to get the newer version of the dataset shutil.rmtree( - os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)) + os.path.join(self.workdir, "org", "openml", "test", 
"datasets", str(did)), ) def test_data_edit_errors(self): @@ -1547,7 +1556,7 @@ def test_data_edit_errors(self): def test_data_fork(self): did = 1 result = fork_dataset(did) - self.assertNotEqual(did, result) + assert did != result # Check server exception when unknown dataset is provided self.assertRaisesRegex( OpenMLServerException, @@ -1556,15 +1565,17 @@ def test_data_fork(self): data_id=999999, ) + @pytest.mark.production() def test_get_dataset_parquet(self): # Parquet functionality is disabled on the test server # There is no parquet-copy of the test server yet. openml.config.server = self.production_server dataset = openml.datasets.get_dataset(61) - self.assertIsNotNone(dataset._minio_url) - self.assertIsNotNone(dataset.parquet_file) - self.assertTrue(os.path.isfile(dataset.parquet_file)) + assert dataset._parquet_url is not None + assert dataset.parquet_file is not None + assert os.path.isfile(dataset.parquet_file) + @pytest.mark.production() def test_list_datasets_with_high_size_parameter(self): # Testing on prod since concurrent deletion of uploded datasets make the test fail openml.config.server = self.production_server @@ -1574,11 +1585,11 @@ def test_list_datasets_with_high_size_parameter(self): # Reverting to test server openml.config.server = self.test_server - self.assertEqual(len(datasets_a), len(datasets_b)) + assert len(datasets_a) == len(datasets_b) @pytest.mark.parametrize( - "default_target_attribute,row_id_attribute,ignore_attribute", + ("default_target_attribute", "row_id_attribute", "ignore_attribute"), [ ("wrong", None, None), (None, "wrong", None), @@ -1590,7 +1601,9 @@ def test_list_datasets_with_high_size_parameter(self): ], ) def test_invalid_attribute_validations( - default_target_attribute, row_id_attribute, ignore_attribute + default_target_attribute, + row_id_attribute, + ignore_attribute, ): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -1637,7 +1650,7 @@ def test_invalid_attribute_validations( @pytest.mark.parametrize( - 
"default_target_attribute,row_id_attribute,ignore_attribute", + ("default_target_attribute", "row_id_attribute", "ignore_attribute"), [ ("outlook", None, None), (None, "outlook", None), @@ -1735,7 +1748,7 @@ def test_delete_dataset(self): ) dataset.publish() _dataset_id = dataset.id - self.assertTrue(openml.datasets.delete_dataset(_dataset_id)) + assert openml.datasets.delete_dataset(_dataset_id) @mock.patch.object(requests.Session, "delete") @@ -1745,7 +1758,8 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_ke test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml" ) mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -1768,7 +1782,8 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml" ) mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -1791,7 +1806,8 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key) test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml" ) mock_delete.return_value = create_request_response( - status_code=200, content_filepath=content_file + status_code=200, + content_filepath=content_file, ) success = openml.datasets.delete_dataset(40000) @@ -1811,7 +1827,8 @@ def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key) test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml" ) mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -1841,7 +1858,7 @@ def test_list_datasets(all_datasets: 
pd.DataFrame): # We can only perform a smoke test here because we test on dynamic # data from the internet... # 1087 as the number of datasets on openml.org - assert 100 <= len(all_datasets) + assert len(all_datasets) >= 100 _assert_datasets_have_id_and_valid_status(all_datasets) @@ -1853,13 +1870,14 @@ def test_list_datasets_by_tag(all_datasets: pd.DataFrame): def test_list_datasets_by_size(): datasets = openml.datasets.list_datasets(size=5, output_format="dataframe") - assert 5 == len(datasets) + assert len(datasets) == 5 _assert_datasets_have_id_and_valid_status(datasets) def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): small_datasets = openml.datasets.list_datasets( - number_instances="5..100", output_format="dataframe" + number_instances="5..100", + output_format="dataframe", ) assert 0 < len(small_datasets) <= len(all_datasets) _assert_datasets_have_id_and_valid_status(small_datasets) @@ -1867,7 +1885,8 @@ def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): wide_datasets = openml.datasets.list_datasets( - number_features="50..100", output_format="dataframe" + number_features="50..100", + output_format="dataframe", ) assert 8 <= len(wide_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(wide_datasets) @@ -1875,7 +1894,8 @@ def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): five_class_datasets = openml.datasets.list_datasets( - number_classes="5", output_format="dataframe" + number_classes="5", + output_format="dataframe", ) assert 3 <= len(five_class_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(five_class_datasets) @@ -1883,7 +1903,8 @@ def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): na_datasets = 
openml.datasets.list_datasets( - number_missing_values="5..100", output_format="dataframe" + number_missing_values="5..100", + output_format="dataframe", ) assert 5 <= len(na_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(na_datasets) diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index 70f36ce19..7af01384f 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -1,4 +1,6 @@ # License: BSD 3-Clause +from __future__ import annotations + import pytest import openml @@ -12,19 +14,26 @@ class TestEvaluationFunctions(TestBase): def _check_list_evaluation_setups(self, **kwargs): evals_setups = openml.evaluations.list_evaluations_setups( - "predictive_accuracy", **kwargs, sort_order="desc", output_format="dataframe" + "predictive_accuracy", + **kwargs, + sort_order="desc", + output_format="dataframe", ) evals = openml.evaluations.list_evaluations( - "predictive_accuracy", **kwargs, sort_order="desc", output_format="dataframe" + "predictive_accuracy", + **kwargs, + sort_order="desc", + output_format="dataframe", ) # Check if list is non-empty - self.assertGreater(len(evals_setups), 0) + assert len(evals_setups) > 0 # Check if length is accurate - self.assertEqual(len(evals_setups), len(evals)) + assert len(evals_setups) == len(evals) # Check if output from sort is sorted in the right order self.assertSequenceEqual( - sorted(evals_setups["value"].tolist(), reverse=True), evals_setups["value"].tolist() + sorted(evals_setups["value"].tolist(), reverse=True), + evals_setups["value"].tolist(), ) # Check if output and order of list_evaluations is preserved @@ -34,7 +43,7 @@ def _check_list_evaluation_setups(self, **kwargs): evals_setups = evals_setups.head(1) # Check if the hyper-parameter column is as accurate and flow_id - for index, row in evals_setups.iterrows(): + for _index, row in evals_setups.iterrows(): 
params = openml.runs.get_run(row["run_id"]).parameter_settings list1 = [param["oml:value"] for param in params] list2 = list(row["parameters"].values()) @@ -42,99 +51,119 @@ def _check_list_evaluation_setups(self, **kwargs): self.assertSequenceEqual(sorted(list1), sorted(list2)) return evals_setups + @pytest.mark.production() def test_evaluation_list_filter_task(self): openml.config.server = self.production_server task_id = 7312 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=110, tasks=[task_id] + "predictive_accuracy", + size=110, + tasks=[task_id], ) - self.assertGreater(len(evaluations), 100) - for run_id in evaluations.keys(): - self.assertEqual(evaluations[run_id].task_id, task_id) + assert len(evaluations) > 100 + for run_id in evaluations: + assert evaluations[run_id].task_id == task_id # default behaviour of this method: return aggregated results (not # per fold) - self.assertIsNotNone(evaluations[run_id].value) - self.assertIsNone(evaluations[run_id].values) + assert evaluations[run_id].value is not None + assert evaluations[run_id].values is None + @pytest.mark.production() def test_evaluation_list_filter_uploader_ID_16(self): openml.config.server = self.production_server uploader_id = 16 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=60, uploaders=[uploader_id], output_format="dataframe" + "predictive_accuracy", + size=60, + uploaders=[uploader_id], + output_format="dataframe", ) - self.assertEqual(evaluations["uploader"].unique(), [uploader_id]) + assert evaluations["uploader"].unique() == [uploader_id] - self.assertGreater(len(evaluations), 50) + assert len(evaluations) > 50 + @pytest.mark.production() def test_evaluation_list_filter_uploader_ID_10(self): openml.config.server = self.production_server setup_id = 10 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=60, setups=[setup_id] + "predictive_accuracy", + size=60, + setups=[setup_id], ) - 
self.assertGreater(len(evaluations), 50) - for run_id in evaluations.keys(): - self.assertEqual(evaluations[run_id].setup_id, setup_id) + assert len(evaluations) > 50 + for run_id in evaluations: + assert evaluations[run_id].setup_id == setup_id # default behaviour of this method: return aggregated results (not # per fold) - self.assertIsNotNone(evaluations[run_id].value) - self.assertIsNone(evaluations[run_id].values) + assert evaluations[run_id].value is not None + assert evaluations[run_id].values is None + @pytest.mark.production() def test_evaluation_list_filter_flow(self): openml.config.server = self.production_server flow_id = 100 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=10, flows=[flow_id] + "predictive_accuracy", + size=10, + flows=[flow_id], ) - self.assertGreater(len(evaluations), 2) - for run_id in evaluations.keys(): - self.assertEqual(evaluations[run_id].flow_id, flow_id) + assert len(evaluations) > 2 + for run_id in evaluations: + assert evaluations[run_id].flow_id == flow_id # default behaviour of this method: return aggregated results (not # per fold) - self.assertIsNotNone(evaluations[run_id].value) - self.assertIsNone(evaluations[run_id].values) + assert evaluations[run_id].value is not None + assert evaluations[run_id].values is None + @pytest.mark.production() def test_evaluation_list_filter_run(self): openml.config.server = self.production_server run_id = 12 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=2, runs=[run_id] + "predictive_accuracy", + size=2, + runs=[run_id], ) - self.assertEqual(len(evaluations), 1) - for run_id in evaluations.keys(): - self.assertEqual(evaluations[run_id].run_id, run_id) + assert len(evaluations) == 1 + for run_id in evaluations: + assert evaluations[run_id].run_id == run_id # default behaviour of this method: return aggregated results (not # per fold) - self.assertIsNotNone(evaluations[run_id].value) - 
self.assertIsNone(evaluations[run_id].values) + assert evaluations[run_id].value is not None + assert evaluations[run_id].values is None + @pytest.mark.production() def test_evaluation_list_limit(self): openml.config.server = self.production_server evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=100, offset=100 + "predictive_accuracy", + size=100, + offset=100, ) - self.assertEqual(len(evaluations), 100) + assert len(evaluations) == 100 def test_list_evaluations_empty(self): evaluations = openml.evaluations.list_evaluations("unexisting_measure") if len(evaluations) > 0: raise ValueError("UnitTest Outdated, got somehow results") - self.assertIsInstance(evaluations, dict) + assert isinstance(evaluations, dict) + @pytest.mark.production() def test_evaluation_list_per_fold(self): openml.config.server = self.production_server size = 1000 @@ -152,10 +181,10 @@ def test_evaluation_list_per_fold(self): per_fold=True, ) - self.assertEqual(len(evaluations), size) - for run_id in evaluations.keys(): - self.assertIsNone(evaluations[run_id].value) - self.assertIsNotNone(evaluations[run_id].values) + assert len(evaluations) == size + for run_id in evaluations: + assert evaluations[run_id].value is None + assert evaluations[run_id].values is not None # potentially we could also test array values, but these might be # added in the future @@ -168,39 +197,48 @@ def test_evaluation_list_per_fold(self): uploaders=uploader_ids, per_fold=False, ) - for run_id in evaluations.keys(): - self.assertIsNotNone(evaluations[run_id].value) - self.assertIsNone(evaluations[run_id].values) + for run_id in evaluations: + assert evaluations[run_id].value is not None + assert evaluations[run_id].values is None + @pytest.mark.production() def test_evaluation_list_sort(self): openml.config.server = self.production_server size = 10 task_id = 6 # Get all evaluations of the task unsorted_eval = openml.evaluations.list_evaluations( - "predictive_accuracy", size=None, 
offset=0, tasks=[task_id] + "predictive_accuracy", + size=None, + offset=0, + tasks=[task_id], ) # Get top 10 evaluations of the same task sorted_eval = openml.evaluations.list_evaluations( - "predictive_accuracy", size=size, offset=0, tasks=[task_id], sort_order="desc" + "predictive_accuracy", + size=size, + offset=0, + tasks=[task_id], + sort_order="desc", ) - self.assertEqual(len(sorted_eval), size) - self.assertGreater(len(unsorted_eval), 0) + assert len(sorted_eval) == size + assert len(unsorted_eval) > 0 sorted_output = [evaluation.value for evaluation in sorted_eval.values()] unsorted_output = [evaluation.value for evaluation in unsorted_eval.values()] # Check if output from sort is sorted in the right order - self.assertTrue(sorted(sorted_output, reverse=True) == sorted_output) + assert sorted(sorted_output, reverse=True) == sorted_output # Compare manual sorting against sorted output test_output = sorted(unsorted_output, reverse=True) - self.assertTrue(test_output[:size] == sorted_output) + assert test_output[:size] == sorted_output def test_list_evaluation_measures(self): measures = openml.evaluations.list_evaluation_measures() - self.assertEqual(isinstance(measures, list), True) - self.assertEqual(all([isinstance(s, str) for s in measures]), True) + assert isinstance(measures, list) is True + assert all(isinstance(s, str) for s in measures) is True + @pytest.mark.production() def test_list_evaluations_setups_filter_flow(self): openml.config.server = self.production_server flow_id = [405] @@ -217,8 +255,9 @@ def test_list_evaluations_setups_filter_flow(self): ) columns = list(evals_cols.columns) keys = list(evals["parameters"].values[0].keys()) - self.assertTrue(all(elem in columns for elem in keys)) + assert all(elem in columns for elem in keys) + @pytest.mark.production() def test_list_evaluations_setups_filter_task(self): openml.config.server = self.production_server task_id = [6] diff --git a/tests/test_evaluations/test_evaluations_example.py 
b/tests/test_evaluations/test_evaluations_example.py index 5715b570a..bf5b03f3f 100644 --- a/tests/test_evaluations/test_evaluations_example.py +++ b/tests/test_evaluations/test_evaluations_example.py @@ -1,4 +1,5 @@ # License: BSD 3-Clause +from __future__ import annotations import unittest @@ -8,9 +9,10 @@ def test_example_python_paper(self): # Example script which will appear in the upcoming OpenML-Python paper # This test ensures that the example will keep running! - import openml - import numpy as np import matplotlib.pyplot as plt + import numpy as np + + import openml df = openml.evaluations.list_evaluations_setups( "predictive_accuracy", diff --git a/tests/test_extensions/test_functions.py b/tests/test_extensions/test_functions.py index 36bb06061..bc7937c88 100644 --- a/tests/test_extensions/test_functions.py +++ b/tests/test_extensions/test_functions.py @@ -1,10 +1,12 @@ # License: BSD 3-Clause +from __future__ import annotations import inspect -import openml.testing +import pytest -from openml.extensions import get_extension_by_model, get_extension_by_flow, register_extension +import openml.testing +from openml.extensions import get_extension_by_flow, get_extension_by_model, register_extension class DummyFlow: @@ -61,31 +63,29 @@ def setUp(self): _unregister() def test_get_extension_by_flow(self): - self.assertIsNone(get_extension_by_flow(DummyFlow())) - with self.assertRaisesRegex(ValueError, "No extension registered which can handle flow:"): + assert get_extension_by_flow(DummyFlow()) is None + with pytest.raises(ValueError, match="No extension registered which can handle flow:"): get_extension_by_flow(DummyFlow(), raise_if_no_extension=True) register_extension(DummyExtension1) - self.assertIsInstance(get_extension_by_flow(DummyFlow()), DummyExtension1) + assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1) register_extension(DummyExtension2) - self.assertIsInstance(get_extension_by_flow(DummyFlow()), DummyExtension1) + assert 
isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1) register_extension(DummyExtension1) - with self.assertRaisesRegex( - ValueError, - "Multiple extensions registered which can handle flow:", + with pytest.raises( + ValueError, match="Multiple extensions registered which can handle flow:" ): get_extension_by_flow(DummyFlow()) def test_get_extension_by_model(self): - self.assertIsNone(get_extension_by_model(DummyModel())) - with self.assertRaisesRegex(ValueError, "No extension registered which can handle model:"): + assert get_extension_by_model(DummyModel()) is None + with pytest.raises(ValueError, match="No extension registered which can handle model:"): get_extension_by_model(DummyModel(), raise_if_no_extension=True) register_extension(DummyExtension1) - self.assertIsInstance(get_extension_by_model(DummyModel()), DummyExtension1) + assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1) register_extension(DummyExtension2) - self.assertIsInstance(get_extension_by_model(DummyModel()), DummyExtension1) + assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1) register_extension(DummyExtension1) - with self.assertRaisesRegex( - ValueError, - "Multiple extensions registered which can handle model:", + with pytest.raises( + ValueError, match="Multiple extensions registered which can handle model:" ): get_extension_by_model(DummyModel()) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 2b07796ed..4c7b0d60e 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -1,17 +1,17 @@ # License: BSD 3-Clause +from __future__ import annotations import collections import json -import re import os +import re import sys -from typing import Any import unittest -from distutils.version import LooseVersion +import warnings 
from collections import OrderedDict +from distutils.version import LooseVersion +from typing import Any from unittest import mock -import warnings -from packaging import version import numpy as np import pandas as pd @@ -19,6 +19,7 @@ import scipy.optimize import scipy.stats import sklearn.base +import sklearn.cluster import sklearn.datasets import sklearn.decomposition import sklearn.dummy @@ -32,19 +33,17 @@ import sklearn.pipeline import sklearn.preprocessing import sklearn.tree -import sklearn.cluster +from packaging import version from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler import openml -from openml.extensions.sklearn import SklearnExtension from openml.exceptions import PyOpenMLError +from openml.extensions.sklearn import SklearnExtension, cat, cont from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase, SimpleImputer, CustomImputer -from openml.extensions.sklearn import cat, cont - +from openml.testing import CustomImputer, SimpleImputer, TestBase this_directory = os.path.dirname(os.path.abspath(__file__)) sys.path.append(this_directory) @@ -115,7 +114,12 @@ def _get_expected_pipeline_description(self, model: Any) -> str: return expected_fixture def _serialization_test_helper( - self, model, X, y, subcomponent_parameters, dependencies_mock_call_count=(1, 2) + self, + model, + X, + y, + subcomponent_parameters, + dependencies_mock_call_count=(1, 2), ): # Regex pattern for memory addresses of style 0x7f8e0f31ecf8 pattern = re.compile("0x[0-9a-f]{12}") @@ -129,61 +133,60 @@ def _serialization_test_helper( new_model = self.extension.flow_to_model(serialization) # compares string representations of the dict, as it potentially # contains complex objects that can not be compared with == op - self.assertEqual( - re.sub(pattern, str(model.get_params()), ""), - re.sub(pattern, 
str(new_model.get_params()), ""), + assert re.sub(pattern, str(model.get_params()), "") == re.sub( + pattern, str(new_model.get_params()), "" ) - self.assertEqual(type(new_model), type(model)) - self.assertIsNot(new_model, model) + assert type(new_model) == type(model) + assert new_model is not model if X is not None: new_model.fit(self.X, self.y) - self.assertEqual(check_dependencies_mock.call_count, dependencies_mock_call_count[0]) + assert check_dependencies_mock.call_count == dependencies_mock_call_count[0] xml = serialization._to_dict() new_model2 = self.extension.flow_to_model(OpenMLFlow._from_dict(xml)) - self.assertEqual( - re.sub(pattern, str(model.get_params()), ""), - re.sub(pattern, str(new_model2.get_params()), ""), + assert re.sub(pattern, str(model.get_params()), "") == re.sub( + pattern, str(new_model2.get_params()), "" ) - self.assertEqual(type(new_model2), type(model)) - self.assertIsNot(new_model2, model) + assert type(new_model2) == type(model) + assert new_model2 is not model if X is not None: new_model2.fit(self.X, self.y) - self.assertEqual(check_dependencies_mock.call_count, dependencies_mock_call_count[1]) + assert check_dependencies_mock.call_count == dependencies_mock_call_count[1] if subcomponent_parameters: for nm in (new_model, new_model2): new_model_params = nm.get_params() model_params = model.get_params() for subcomponent_parameter in subcomponent_parameters: - self.assertEqual( - type(new_model_params[subcomponent_parameter]), - type(model_params[subcomponent_parameter]), + assert type(new_model_params[subcomponent_parameter]) == type( + model_params[subcomponent_parameter] ) - self.assertIsNot( - new_model_params[subcomponent_parameter], - model_params[subcomponent_parameter], + assert ( + new_model_params[subcomponent_parameter] + is not model_params[subcomponent_parameter] ) del new_model_params[subcomponent_parameter] del model_params[subcomponent_parameter] - self.assertEqual(new_model_params, model_params) + assert 
new_model_params == model_params return serialization, new_model - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_model(self): model = sklearn.tree.DecisionTreeClassifier( - criterion="entropy", max_features="auto", max_leaf_nodes=2000 + criterion="entropy", + max_features="auto", + max_leaf_nodes=2000, ) tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" - fixture_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) + fixture_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier" fixture_short_name = "sklearn.DecisionTreeClassifier" # str obtained from self.extension._get_sklearn_description(model) fixture_description = "A decision tree classifier." @@ -207,7 +210,7 @@ def test_serialize_model(self): ("presort", "false"), ("random_state", "null"), ("splitter", '"best"'), - ) + ), ) elif LooseVersion(sklearn.__version__) < "1.0": fixture_parameters = OrderedDict( @@ -225,7 +228,7 @@ def test_serialize_model(self): ("presort", presort_val), ("random_state", "null"), ("splitter", '"best"'), - ) + ), ) else: fixture_parameters = OrderedDict( @@ -242,7 +245,7 @@ def test_serialize_model(self): ("presort", presort_val), ("random_state", "null"), ("splitter", '"best"'), - ) + ), ) if LooseVersion(sklearn.__version__) >= "0.22": @@ -251,22 +254,26 @@ def test_serialize_model(self): if LooseVersion(sklearn.__version__) >= "0.24": del fixture_parameters["presort"] - structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []} + structure_fixture = {f"sklearn.tree.{tree_name}.DecisionTreeClassifier": []} serialization, _ = self._serialization_test_helper( - model, X=self.X, y=self.y, subcomponent_parameters=None + model, + X=self.X, + y=self.y, + subcomponent_parameters=None, ) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.class_name, fixture_name) - self.assertEqual(serialization.custom_name, 
fixture_short_name) - self.assertEqual(serialization.description, fixture_description) - self.assertEqual(serialization.parameters, fixture_parameters) - self.assertEqual(serialization.dependencies, version_fixture) + assert serialization.name == fixture_name + assert serialization.class_name == fixture_name + assert serialization.custom_name == fixture_short_name + assert serialization.description == fixture_description + assert serialization.parameters == fixture_parameters + assert serialization.dependencies == version_fixture self.assertDictEqual(structure, structure_fixture) - @pytest.mark.sklearn + @pytest.mark.sklearn() + @pytest.mark.production() def test_can_handle_flow(self): openml.config.server = self.production_server @@ -277,16 +284,16 @@ def test_can_handle_flow(self): openml.config.server = self.test_server - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_model_clustering(self): model = sklearn.cluster.KMeans() cluster_name = "k_means_" if LooseVersion(sklearn.__version__) < "0.22" else "_kmeans" - fixture_name = "sklearn.cluster.{}.KMeans".format(cluster_name) + fixture_name = f"sklearn.cluster.{cluster_name}.KMeans" fixture_short_name = "sklearn.KMeans" # str obtained from self.extension._get_sklearn_description(model) fixture_description = "K-Means clustering{}".format( - "" if LooseVersion(sklearn.__version__) < "0.22" else "." 
+ "" if LooseVersion(sklearn.__version__) < "0.22" else ".", ) version_fixture = self.extension._min_dependency_str(sklearn.__version__) @@ -308,7 +315,7 @@ def test_serialize_model_clustering(self): ("random_state", "null"), ("tol", "0.0001"), ("verbose", "0"), - ) + ), ) elif LooseVersion(sklearn.__version__) < "1.0": fixture_parameters = OrderedDict( @@ -324,7 +331,7 @@ def test_serialize_model_clustering(self): ("random_state", "null"), ("tol", "0.0001"), ("verbose", "0"), - ) + ), ) elif LooseVersion(sklearn.__version__) < "1.1": fixture_parameters = OrderedDict( @@ -338,7 +345,7 @@ def test_serialize_model_clustering(self): ("random_state", "null"), ("tol", "0.0001"), ("verbose", "0"), - ) + ), ) else: n_init = '"warn"' if LooseVersion(sklearn.__version__) >= "1.2" else "10" @@ -353,12 +360,15 @@ def test_serialize_model_clustering(self): ("random_state", "null"), ("tol", "0.0001"), ("verbose", "0"), - ) + ), ) - fixture_structure = {"sklearn.cluster.{}.KMeans".format(cluster_name): []} + fixture_structure = {f"sklearn.cluster.{cluster_name}.KMeans": []} serialization, _ = self._serialization_test_helper( - model, X=None, y=None, subcomponent_parameters=None + model, + X=None, + y=None, + subcomponent_parameters=None, ) structure = serialization.get_structure("name") @@ -370,21 +380,22 @@ def test_serialize_model_clustering(self): assert serialization.dependencies == version_fixture assert structure == fixture_structure - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_model_with_subcomponent(self): model = sklearn.ensemble.AdaBoostClassifier( - n_estimators=100, base_estimator=sklearn.tree.DecisionTreeClassifier() + n_estimators=100, + base_estimator=sklearn.tree.DecisionTreeClassifier(), ) weight_name = "{}weight_boosting".format( - "" if LooseVersion(sklearn.__version__) < "0.22" else "_" + "" if LooseVersion(sklearn.__version__) < "0.22" else "_", ) tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" 
fixture_name = ( - "sklearn.ensemble.{}.AdaBoostClassifier" - "(base_estimator=sklearn.tree.{}.DecisionTreeClassifier)".format(weight_name, tree_name) + f"sklearn.ensemble.{weight_name}.AdaBoostClassifier" + f"(base_estimator=sklearn.tree.{tree_name}.DecisionTreeClassifier)" ) - fixture_class_name = "sklearn.ensemble.{}.AdaBoostClassifier".format(weight_name) + fixture_class_name = f"sklearn.ensemble.{weight_name}.AdaBoostClassifier" fixture_short_name = "sklearn.AdaBoostClassifier" # str obtained from self.extension._get_sklearn_description(model) fixture_description = ( @@ -396,13 +407,13 @@ def test_serialize_model_with_subcomponent(self): " on difficult cases.\n\nThis class implements the algorithm known " "as AdaBoost-SAMME [2]." ) - fixture_subcomponent_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) - fixture_subcomponent_class_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) + fixture_subcomponent_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier" + fixture_subcomponent_class_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier" # str obtained from self.extension._get_sklearn_description(model.base_estimator) fixture_subcomponent_description = "A decision tree classifier." 
fixture_structure = { fixture_name: [], - "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): ["base_estimator"], + f"sklearn.tree.{tree_name}.DecisionTreeClassifier": ["base_estimator"], } serialization, _ = self._serialization_test_helper( @@ -414,24 +425,25 @@ def test_serialize_model_with_subcomponent(self): ) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.class_name, fixture_class_name) - self.assertEqual(serialization.custom_name, fixture_short_name) - self.assertEqual(serialization.description, fixture_description) - self.assertEqual(serialization.parameters["algorithm"], '"SAMME.R"') - self.assertIsInstance(serialization.parameters["base_estimator"], str) - self.assertEqual(serialization.parameters["learning_rate"], "1.0") - self.assertEqual(serialization.parameters["n_estimators"], "100") - self.assertEqual(serialization.components["base_estimator"].name, fixture_subcomponent_name) - self.assertEqual( - serialization.components["base_estimator"].class_name, fixture_subcomponent_class_name - ) - self.assertEqual( - serialization.components["base_estimator"].description, fixture_subcomponent_description + assert serialization.name == fixture_name + assert serialization.class_name == fixture_class_name + assert serialization.custom_name == fixture_short_name + assert serialization.description == fixture_description + assert serialization.parameters["algorithm"] == '"SAMME.R"' + assert isinstance(serialization.parameters["base_estimator"], str) + assert serialization.parameters["learning_rate"] == "1.0" + assert serialization.parameters["n_estimators"] == "100" + assert serialization.components["base_estimator"].name == fixture_subcomponent_name + assert ( + serialization.components["base_estimator"].class_name == fixture_subcomponent_class_name + ) + assert ( + serialization.components["base_estimator"].description + == fixture_subcomponent_description ) 
self.assertDictEqual(structure, fixture_structure) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_pipeline(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) dummy = sklearn.dummy.DummyClassifier(strategy="prior") @@ -440,14 +452,14 @@ def test_serialize_pipeline(self): scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" fixture_name = ( "sklearn.pipeline.Pipeline(" - "scaler=sklearn.preprocessing.{}.StandardScaler," - "dummy=sklearn.dummy.DummyClassifier)".format(scaler_name) + f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler," + "dummy=sklearn.dummy.DummyClassifier)" ) fixture_short_name = "sklearn.Pipeline(StandardScaler,DummyClassifier)" fixture_description = self._get_expected_pipeline_description(model) fixture_structure = { fixture_name: [], - "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["scaler"], + f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], "sklearn.dummy.DummyClassifier": ["dummy"], } @@ -460,9 +472,9 @@ def test_serialize_pipeline(self): ) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.custom_name, fixture_short_name) - self.assertEqual(serialization.description, fixture_description) + assert serialization.name == fixture_name + assert serialization.custom_name == fixture_short_name + assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) # Comparing the pipeline @@ -470,38 +482,35 @@ def test_serialize_pipeline(self): # as value # memory parameter has been added in 0.19, verbose in 0.21 if LooseVersion(sklearn.__version__) < "0.19": - self.assertEqual(len(serialization.parameters), 1) + assert len(serialization.parameters) == 1 elif LooseVersion(sklearn.__version__) < "0.21": - self.assertEqual(len(serialization.parameters), 2) + assert len(serialization.parameters) == 2 else: - 
self.assertEqual(len(serialization.parameters), 3) + assert len(serialization.parameters) == 3 # Hard to compare two representations of a dict due to possibly # different sorting. Making a json makes it easier - self.assertEqual( - json.loads(serialization.parameters["steps"]), - [ - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "scaler", "step_name": "scaler"}, - }, - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "dummy", "step_name": "dummy"}, - }, - ], - ) + assert json.loads(serialization.parameters["steps"]) == [ + { + "oml-python:serialized_object": "component_reference", + "value": {"key": "scaler", "step_name": "scaler"}, + }, + { + "oml-python:serialized_object": "component_reference", + "value": {"key": "dummy", "step_name": "dummy"}, + }, + ] # Checking the sub-component - self.assertEqual(len(serialization.components), 2) - self.assertIsInstance(serialization.components["scaler"], OpenMLFlow) - self.assertIsInstance(serialization.components["dummy"], OpenMLFlow) + assert len(serialization.components) == 2 + assert isinstance(serialization.components["scaler"], OpenMLFlow) + assert isinstance(serialization.components["dummy"], OpenMLFlow) - self.assertEqual([step[0] for step in new_model.steps], [step[0] for step in model.steps]) - self.assertIsNot(new_model.steps[0][1], model.steps[0][1]) - self.assertIsNot(new_model.steps[1][1], model.steps[1][1]) + assert [step[0] for step in new_model.steps] == [step[0] for step in model.steps] + assert new_model.steps[0][1] is not model.steps[0][1] + assert new_model.steps[1][1] is not model.steps[1][1] - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_pipeline_clustering(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) km = sklearn.cluster.KMeans() @@ -511,15 +520,15 @@ def test_serialize_pipeline_clustering(self): cluster_name = "k_means_" if LooseVersion(sklearn.__version__) < "0.22" else "_kmeans" fixture_name = 
( "sklearn.pipeline.Pipeline(" - "scaler=sklearn.preprocessing.{}.StandardScaler," - "clusterer=sklearn.cluster.{}.KMeans)".format(scaler_name, cluster_name) + f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler," + f"clusterer=sklearn.cluster.{cluster_name}.KMeans)" ) fixture_short_name = "sklearn.Pipeline(StandardScaler,KMeans)" fixture_description = self._get_expected_pipeline_description(model) fixture_structure = { fixture_name: [], - "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["scaler"], - "sklearn.cluster.{}.KMeans".format(cluster_name): ["clusterer"], + f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], + f"sklearn.cluster.{cluster_name}.KMeans": ["clusterer"], } serialization, new_model = self._serialization_test_helper( model, @@ -530,9 +539,9 @@ def test_serialize_pipeline_clustering(self): ) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.custom_name, fixture_short_name) - self.assertEqual(serialization.description, fixture_description) + assert serialization.name == fixture_name + assert serialization.custom_name == fixture_short_name + assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) # Comparing the pipeline @@ -540,37 +549,34 @@ def test_serialize_pipeline_clustering(self): # as value # memory parameter has been added in 0.19 if LooseVersion(sklearn.__version__) < "0.19": - self.assertEqual(len(serialization.parameters), 1) + assert len(serialization.parameters) == 1 elif LooseVersion(sklearn.__version__) < "0.21": - self.assertEqual(len(serialization.parameters), 2) + assert len(serialization.parameters) == 2 else: - self.assertEqual(len(serialization.parameters), 3) + assert len(serialization.parameters) == 3 # Hard to compare two representations of a dict due to possibly # different sorting. 
Making a json makes it easier - self.assertEqual( - json.loads(serialization.parameters["steps"]), - [ - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "scaler", "step_name": "scaler"}, - }, - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "clusterer", "step_name": "clusterer"}, - }, - ], - ) + assert json.loads(serialization.parameters["steps"]) == [ + { + "oml-python:serialized_object": "component_reference", + "value": {"key": "scaler", "step_name": "scaler"}, + }, + { + "oml-python:serialized_object": "component_reference", + "value": {"key": "clusterer", "step_name": "clusterer"}, + }, + ] # Checking the sub-component - self.assertEqual(len(serialization.components), 2) - self.assertIsInstance(serialization.components["scaler"], OpenMLFlow) - self.assertIsInstance(serialization.components["clusterer"], OpenMLFlow) + assert len(serialization.components) == 2 + assert isinstance(serialization.components["scaler"], OpenMLFlow) + assert isinstance(serialization.components["clusterer"], OpenMLFlow) - self.assertEqual([step[0] for step in new_model.steps], [step[0] for step in model.steps]) - self.assertIsNot(new_model.steps[0][1], model.steps[0][1]) - self.assertIsNot(new_model.steps[1][1], model.steps[1][1]) + assert [step[0] for step in new_model.steps] == [step[0] for step in model.steps] + assert new_model.steps[0][1] is not model.steps[0][1] + assert new_model.steps[1][1] is not model.steps[1][1] - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -595,8 +601,8 @@ def test_serialize_column_transformer(self): scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" fixture = ( "sklearn.compose._column_transformer.ColumnTransformer(" - "numeric=sklearn.preprocessing.{}.StandardScaler," - 
"nominal=sklearn.preprocessing._encoders.OneHotEncoder,drop=drop)".format(scaler_name) + f"numeric=sklearn.preprocessing.{scaler_name}.StandardScaler," + "nominal=sklearn.preprocessing._encoders.OneHotEncoder,drop=drop)" ) fixture_short_name = "sklearn.ColumnTransformer" @@ -617,19 +623,19 @@ def test_serialize_column_transformer(self): fixture_structure = { fixture: [], - "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["numeric"], + f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["numeric"], "sklearn.preprocessing._encoders.OneHotEncoder": ["nominal"], "drop": ["drop"], } serialization = self.extension.model_to_flow(model) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture) - self.assertEqual(serialization.custom_name, fixture_short_name) - self.assertEqual(serialization.description, fixture_description) + assert serialization.name == fixture + assert serialization.custom_name == fixture_short_name + assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -650,7 +656,7 @@ def test_serialize_column_transformer_pipeline(self): remainder="passthrough", ) model = sklearn.pipeline.Pipeline( - steps=[("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier())] + steps=[("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier())], ) scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" @@ -658,20 +664,20 @@ def test_serialize_column_transformer_pipeline(self): "sklearn.pipeline.Pipeline(" "transformer=sklearn.compose._column_transformer." 
"ColumnTransformer(" - "numeric=sklearn.preprocessing.{}.StandardScaler," + f"numeric=sklearn.preprocessing.{scaler_name}.StandardScaler," "nominal=sklearn.preprocessing._encoders.OneHotEncoder)," - "classifier=sklearn.tree.{}.DecisionTreeClassifier)".format(scaler_name, tree_name) + f"classifier=sklearn.tree.{tree_name}.DecisionTreeClassifier)" ) fixture_structure = { - "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): [ + f"sklearn.preprocessing.{scaler_name}.StandardScaler": [ "transformer", "numeric", ], "sklearn.preprocessing._encoders.OneHotEncoder": ["transformer", "nominal"], "sklearn.compose._column_transformer.ColumnTransformer(numeric=" - "sklearn.preprocessing.{}.StandardScaler,nominal=sklearn." - "preprocessing._encoders.OneHotEncoder)".format(scaler_name): ["transformer"], - "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): ["classifier"], + f"sklearn.preprocessing.{scaler_name}.StandardScaler,nominal=sklearn." + "preprocessing._encoders.OneHotEncoder)": ["transformer"], + f"sklearn.tree.{tree_name}.DecisionTreeClassifier": ["classifier"], fixture_name: [], } @@ -691,14 +697,15 @@ def test_serialize_column_transformer_pipeline(self): dependencies_mock_call_count=(5, 10), ) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.description, fixture_description) + assert serialization.name == fixture_name + assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", reason="Pipeline processing behaviour updated" + LooseVersion(sklearn.__version__) < "0.20", + reason="Pipeline processing behaviour updated", ) def test_serialize_feature_union(self): ohe_params = {"sparse": False} @@ -721,33 +728,30 @@ def test_serialize_feature_union(self): scaler_name = "data" if LooseVersion(sklearn.__version__) < 
"0.22" else "_data" fixture_name = ( "sklearn.pipeline.FeatureUnion(" - "ohe=sklearn.preprocessing.{}.OneHotEncoder," - "scaler=sklearn.preprocessing.{}.StandardScaler)".format( - module_name_encoder, scaler_name - ) + f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," + f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler)" ) fixture_structure = { fixture_name: [], - "sklearn.preprocessing.{}." "OneHotEncoder".format(module_name_encoder): ["ohe"], - "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["scaler"], + f"sklearn.preprocessing.{module_name_encoder}." "OneHotEncoder": ["ohe"], + f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], } - self.assertEqual(serialization.name, fixture_name) + assert serialization.name == fixture_name self.assertDictEqual(structure, fixture_structure) - self.assertEqual(new_model.transformer_list[0][0], fu.transformer_list[0][0]) - self.assertEqual( - new_model.transformer_list[0][1].get_params(), fu.transformer_list[0][1].get_params() + assert new_model.transformer_list[0][0] == fu.transformer_list[0][0] + assert ( + new_model.transformer_list[0][1].get_params() == fu.transformer_list[0][1].get_params() ) - self.assertEqual(new_model.transformer_list[1][0], fu.transformer_list[1][0]) - self.assertEqual( - new_model.transformer_list[1][1].get_params(), fu.transformer_list[1][1].get_params() + assert new_model.transformer_list[1][0] == fu.transformer_list[1][0] + assert ( + new_model.transformer_list[1][1].get_params() == fu.transformer_list[1][1].get_params() ) - self.assertEqual( - [step[0] for step in new_model.transformer_list], - [step[0] for step in fu.transformer_list], - ) - self.assertIsNot(new_model.transformer_list[0][1], fu.transformer_list[0][1]) - self.assertIsNot(new_model.transformer_list[1][1], fu.transformer_list[1][1]) + assert [step[0] for step in new_model.transformer_list] == [ + step[0] for step in fu.transformer_list + ] + assert 
new_model.transformer_list[0][1] is not fu.transformer_list[0][1] + assert new_model.transformer_list[1][1] is not fu.transformer_list[1][1] fu.set_params(scaler="drop") serialization, new_model = self._serialization_test_helper( @@ -757,15 +761,14 @@ def test_serialize_feature_union(self): subcomponent_parameters=("ohe", "transformer_list"), dependencies_mock_call_count=(3, 6), ) - self.assertEqual( - serialization.name, - "sklearn.pipeline.FeatureUnion(" - "ohe=sklearn.preprocessing.{}.OneHotEncoder," - "scaler=drop)".format(module_name_encoder), + assert ( + serialization.name == "sklearn.pipeline.FeatureUnion(" + f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," + "scaler=drop)" ) - self.assertIs(new_model.transformer_list[1][1], "drop") + assert new_model.transformer_list[1][1] == "drop" - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_feature_union_switched_names(self): ohe_params = {"categories": "auto"} if LooseVersion(sklearn.__version__) >= "0.20" else {} ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params) @@ -791,30 +794,26 @@ def test_serialize_feature_union_switched_names(self): # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" - self.assertEqual( - fu1_serialization.name, - "sklearn.pipeline.FeatureUnion(" - "ohe=sklearn.preprocessing.{}.OneHotEncoder," - "scaler=sklearn.preprocessing.{}.StandardScaler)".format( - module_name_encoder, scaler_name - ), + assert ( + fu1_serialization.name == "sklearn.pipeline.FeatureUnion(" + f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," + f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler)" ) - self.assertEqual( - fu2_serialization.name, - "sklearn.pipeline.FeatureUnion(" - "scaler=sklearn.preprocessing.{}.OneHotEncoder," - 
"ohe=sklearn.preprocessing.{}.StandardScaler)".format(module_name_encoder, scaler_name), + assert ( + fu2_serialization.name == "sklearn.pipeline.FeatureUnion(" + f"scaler=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," + f"ohe=sklearn.preprocessing.{scaler_name}.StandardScaler)" ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_complex_flow(self): ohe = sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore") scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier() + base_estimator=sklearn.tree.DecisionTreeClassifier(), ) model = sklearn.pipeline.Pipeline( - steps=[("ohe", ohe), ("scaler", scaler), ("boosting", boosting)] + steps=[("ohe", ohe), ("scaler", scaler), ("boosting", boosting)], ) parameter_grid = { "boosting__base_estimator__max_depth": scipy.stats.randint(1, 10), @@ -825,7 +824,9 @@ def test_serialize_complex_flow(self): parameter_grid = OrderedDict(sorted(parameter_grid.items())) cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True) rs = sklearn.model_selection.RandomizedSearchCV( - estimator=model, param_distributions=parameter_grid, cv=cv + estimator=model, + param_distributions=parameter_grid, + cv=cv, ) serialized, new_model = self._serialization_test_helper( rs, @@ -839,16 +840,17 @@ def test_serialize_complex_flow(self): module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" ohe_name = "sklearn.preprocessing.%s.OneHotEncoder" % module_name_encoder scaler_name = "sklearn.preprocessing.{}.StandardScaler".format( - "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" + "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data", ) tree_name = "sklearn.tree.{}.DecisionTreeClassifier".format( - "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" + "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes", ) 
weight_name = "weight" if LooseVersion(sklearn.__version__) < "0.22" else "_weight" boosting_name = "sklearn.ensemble.{}_boosting.AdaBoostClassifier(base_estimator={})".format( - weight_name, tree_name + weight_name, + tree_name, ) - pipeline_name = "sklearn.pipeline.Pipeline(ohe=%s,scaler=%s," "boosting=%s)" % ( + pipeline_name = "sklearn.pipeline.Pipeline(ohe={},scaler={}," "boosting={})".format( ohe_name, scaler_name, boosting_name, @@ -864,10 +866,10 @@ def test_serialize_complex_flow(self): pipeline_name: ["estimator"], fixture_name: [], } - self.assertEqual(serialized.name, fixture_name) - self.assertEqual(structure, fixture_structure) + assert serialized.name == fixture_name + assert structure == fixture_structure - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="Pipeline till 0.20 doesn't support 'passthrough'", @@ -878,53 +880,56 @@ def test_serialize_strings_as_pipeline_steps(self): # First check: test whether a passthrough in a pipeline is serialized correctly model = sklearn.pipeline.Pipeline(steps=[("transformer", "passthrough")]) serialized = self.extension.model_to_flow(model) - self.assertIsInstance(serialized, OpenMLFlow) - self.assertEqual(len(serialized.components), 1) - self.assertEqual(serialized.components["transformer"].name, "passthrough") + assert isinstance(serialized, OpenMLFlow) + assert len(serialized.components) == 1 + assert serialized.components["transformer"].name == "passthrough" serialized = self.extension._serialize_sklearn( - ("transformer", "passthrough"), parent_model=model + ("transformer", "passthrough"), + parent_model=model, ) - self.assertEqual(serialized, ("transformer", "passthrough")) + assert serialized == ("transformer", "passthrough") extracted_info = self.extension._extract_information_from_model(model) - self.assertEqual(len(extracted_info[2]), 1) - self.assertIsInstance(extracted_info[2]["transformer"], OpenMLFlow) - 
self.assertEqual(extracted_info[2]["transformer"].name, "passthrough") + assert len(extracted_info[2]) == 1 + assert isinstance(extracted_info[2]["transformer"], OpenMLFlow) + assert extracted_info[2]["transformer"].name == "passthrough" # Second check: test whether a lone passthrough in a column transformer is serialized # correctly model = sklearn.compose.ColumnTransformer([("passthrough", "passthrough", (0,))]) serialized = self.extension.model_to_flow(model) - self.assertIsInstance(serialized, OpenMLFlow) - self.assertEqual(len(serialized.components), 1) - self.assertEqual(serialized.components["passthrough"].name, "passthrough") + assert isinstance(serialized, OpenMLFlow) + assert len(serialized.components) == 1 + assert serialized.components["passthrough"].name == "passthrough" serialized = self.extension._serialize_sklearn( - ("passthrough", "passthrough"), parent_model=model + ("passthrough", "passthrough"), + parent_model=model, ) - self.assertEqual(serialized, ("passthrough", "passthrough")) + assert serialized == ("passthrough", "passthrough") extracted_info = self.extension._extract_information_from_model(model) - self.assertEqual(len(extracted_info[2]), 1) - self.assertIsInstance(extracted_info[2]["passthrough"], OpenMLFlow) - self.assertEqual(extracted_info[2]["passthrough"].name, "passthrough") + assert len(extracted_info[2]) == 1 + assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow) + assert extracted_info[2]["passthrough"].name == "passthrough" # Third check: passthrough and drop in a column transformer model = sklearn.compose.ColumnTransformer( - [("passthrough", "passthrough", (0,)), ("drop", "drop", (1,))] + [("passthrough", "passthrough", (0,)), ("drop", "drop", (1,))], ) serialized = self.extension.model_to_flow(model) - self.assertIsInstance(serialized, OpenMLFlow) - self.assertEqual(len(serialized.components), 2) - self.assertEqual(serialized.components["passthrough"].name, "passthrough") - 
self.assertEqual(serialized.components["drop"].name, "drop") + assert isinstance(serialized, OpenMLFlow) + assert len(serialized.components) == 2 + assert serialized.components["passthrough"].name == "passthrough" + assert serialized.components["drop"].name == "drop" serialized = self.extension._serialize_sklearn( - ("passthrough", "passthrough"), parent_model=model + ("passthrough", "passthrough"), + parent_model=model, ) - self.assertEqual(serialized, ("passthrough", "passthrough")) + assert serialized == ("passthrough", "passthrough") extracted_info = self.extension._extract_information_from_model(model) - self.assertEqual(len(extracted_info[2]), 2) - self.assertIsInstance(extracted_info[2]["passthrough"], OpenMLFlow) - self.assertIsInstance(extracted_info[2]["drop"], OpenMLFlow) - self.assertEqual(extracted_info[2]["passthrough"].name, "passthrough") - self.assertEqual(extracted_info[2]["drop"].name, "drop") + assert len(extracted_info[2]) == 2 + assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow) + assert isinstance(extracted_info[2]["drop"], OpenMLFlow) + assert extracted_info[2]["passthrough"].name == "passthrough" + assert extracted_info[2]["drop"].name == "drop" # Fourth check: having an actual preprocessor in the column transformer, too model = sklearn.compose.ColumnTransformer( @@ -932,50 +937,51 @@ def test_serialize_strings_as_pipeline_steps(self): ("passthrough", "passthrough", (0,)), ("drop", "drop", (1,)), ("test", sklearn.preprocessing.StandardScaler(), (2,)), - ] + ], ) serialized = self.extension.model_to_flow(model) - self.assertIsInstance(serialized, OpenMLFlow) - self.assertEqual(len(serialized.components), 3) - self.assertEqual(serialized.components["passthrough"].name, "passthrough") - self.assertEqual(serialized.components["drop"].name, "drop") + assert isinstance(serialized, OpenMLFlow) + assert len(serialized.components) == 3 + assert serialized.components["passthrough"].name == "passthrough" + assert 
serialized.components["drop"].name == "drop" serialized = self.extension._serialize_sklearn( - ("passthrough", "passthrough"), parent_model=model + ("passthrough", "passthrough"), + parent_model=model, ) - self.assertEqual(serialized, ("passthrough", "passthrough")) + assert serialized == ("passthrough", "passthrough") extracted_info = self.extension._extract_information_from_model(model) - self.assertEqual(len(extracted_info[2]), 3) - self.assertIsInstance(extracted_info[2]["passthrough"], OpenMLFlow) - self.assertIsInstance(extracted_info[2]["drop"], OpenMLFlow) - self.assertEqual(extracted_info[2]["passthrough"].name, "passthrough") - self.assertEqual(extracted_info[2]["drop"].name, "drop") + assert len(extracted_info[2]) == 3 + assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow) + assert isinstance(extracted_info[2]["drop"], OpenMLFlow) + assert extracted_info[2]["passthrough"].name == "passthrough" + assert extracted_info[2]["drop"].name == "drop" # Fifth check: test whether a lone drop in a feature union is serialized correctly model = sklearn.pipeline.FeatureUnion([("drop", "drop")]) serialized = self.extension.model_to_flow(model) - self.assertIsInstance(serialized, OpenMLFlow) - self.assertEqual(len(serialized.components), 1) - self.assertEqual(serialized.components["drop"].name, "drop") + assert isinstance(serialized, OpenMLFlow) + assert len(serialized.components) == 1 + assert serialized.components["drop"].name == "drop" serialized = self.extension._serialize_sklearn(("drop", "drop"), parent_model=model) - self.assertEqual(serialized, ("drop", "drop")) + assert serialized == ("drop", "drop") extracted_info = self.extension._extract_information_from_model(model) - self.assertEqual(len(extracted_info[2]), 1) - self.assertIsInstance(extracted_info[2]["drop"], OpenMLFlow) - self.assertEqual(extracted_info[2]["drop"].name, "drop") + assert len(extracted_info[2]) == 1 + assert isinstance(extracted_info[2]["drop"], OpenMLFlow) + assert 
extracted_info[2]["drop"].name == "drop" - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_type(self): supported_types = [float, np.float32, np.float64, int, np.int32, np.int64] if LooseVersion(np.__version__) < "1.24": - supported_types.append(np.float) - supported_types.append(np.int) + supported_types.append(float) + supported_types.append(int) for supported_type in supported_types: serialized = self.extension.model_to_flow(supported_type) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(deserialized, supported_type) + assert deserialized == supported_type - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_rvs(self): supported_rvs = [ scipy.stats.norm(loc=1, scale=5), @@ -986,18 +992,18 @@ def test_serialize_rvs(self): for supported_rv in supported_rvs: serialized = self.extension.model_to_flow(supported_rv) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(type(deserialized.dist), type(supported_rv.dist)) + assert type(deserialized.dist) == type(supported_rv.dist) del deserialized.dist del supported_rv.dist - self.assertEqual(deserialized.__dict__, supported_rv.__dict__) + assert deserialized.__dict__ == supported_rv.__dict__ - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_function(self): serialized = self.extension.model_to_flow(sklearn.feature_selection.chi2) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(deserialized, sklearn.feature_selection.chi2) + assert deserialized == sklearn.feature_selection.chi2 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_cvobject(self): methods = [sklearn.model_selection.KFold(3), sklearn.model_selection.LeaveOneOut()] fixtures = [ @@ -1016,13 +1022,13 @@ def test_serialize_cvobject(self): ("n_splits", "3"), ("random_state", "null"), ("shuffle", "false"), - ] + ], ), ), - ] + ], ), ), - ] + ], ), OrderedDict( [ @@ -1033,21 +1039,21 @@ def test_serialize_cvobject(self): [ ("name", 
"sklearn.model_selection._split.LeaveOneOut"), ("parameters", OrderedDict()), - ] + ], ), ), - ] + ], ), ] for method, fixture in zip(methods, fixtures): m = self.extension.model_to_flow(method) - self.assertEqual(m, fixture) + assert m == fixture m_new = self.extension.flow_to_model(m) - self.assertIsNot(m_new, m) - self.assertIsInstance(m_new, type(method)) + assert m_new is not m + assert isinstance(m_new, type(method)) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_simple_parameter_grid(self): # We cannot easily test for scipy random variables in here, but they # should be covered @@ -1058,7 +1064,7 @@ def test_serialize_simple_parameter_grid(self): [ OrderedDict([("C", [1, 10, 100, 1000]), ("kernel", ["linear"])]), OrderedDict( - [("C", [1, 10, 100, 1000]), ("gamma", [0.001, 0.0001]), ("kernel", ["rbf"])] + [("C", [1, 10, 100, 1000]), ("gamma", [0.001, 0.0001]), ("kernel", ["rbf"])], ), ], OrderedDict( @@ -1069,7 +1075,7 @@ def test_serialize_simple_parameter_grid(self): ("max_features", [1, 3, 10]), ("min_samples_leaf", [1, 3, 10]), ("min_samples_split", [1, 3, 10]), - ] + ], ), ] @@ -1077,28 +1083,30 @@ def test_serialize_simple_parameter_grid(self): serialized = self.extension.model_to_flow(grid) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(deserialized, grid) - self.assertIsNot(deserialized, grid) + assert deserialized == grid + assert deserialized is not grid # providing error_score because nan != nan hpo = sklearn.model_selection.GridSearchCV( - param_grid=grid, estimator=model, error_score=-1000 + param_grid=grid, + estimator=model, + error_score=-1000, ) serialized = self.extension.model_to_flow(hpo) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(hpo.param_grid, deserialized.param_grid) - self.assertEqual(hpo.estimator.get_params(), deserialized.estimator.get_params()) + assert hpo.param_grid == deserialized.param_grid + assert hpo.estimator.get_params() == 
deserialized.estimator.get_params() hpo_params = hpo.get_params(deep=False) deserialized_params = deserialized.get_params(deep=False) del hpo_params["estimator"] del deserialized_params["estimator"] - self.assertEqual(hpo_params, deserialized_params) + assert hpo_params == deserialized_params - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skip( "This feature needs further reworking. If we allow several " "components, we need to register them all in the downstream " - "flows. This is so far not implemented." + "flows. This is so far not implemented.", ) def test_serialize_advanced_grid(self): # TODO instead a GridSearchCV object should be serialized @@ -1120,7 +1128,7 @@ def test_serialize_advanced_grid(self): }, { "reduce_dim": [ - sklearn.feature_selection.SelectKBest(sklearn.feature_selection.chi2) + sklearn.feature_selection.SelectKBest(sklearn.feature_selection.chi2), ], "reduce_dim__k": N_FEATURES_OPTIONS, "classify__C": C_OPTIONS, @@ -1130,26 +1138,24 @@ def test_serialize_advanced_grid(self): serialized = self.extension.model_to_flow(grid) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual( - grid[0]["reduce_dim"][0].get_params(), deserialized[0]["reduce_dim"][0].get_params() - ) - self.assertIsNot(grid[0]["reduce_dim"][0], deserialized[0]["reduce_dim"][0]) - self.assertEqual( - grid[0]["reduce_dim"][1].get_params(), deserialized[0]["reduce_dim"][1].get_params() + assert ( + grid[0]["reduce_dim"][0].get_params() == deserialized[0]["reduce_dim"][0].get_params() ) - self.assertIsNot(grid[0]["reduce_dim"][1], deserialized[0]["reduce_dim"][1]) - self.assertEqual( - grid[0]["reduce_dim__n_components"], deserialized[0]["reduce_dim__n_components"] + assert grid[0]["reduce_dim"][0] is not deserialized[0]["reduce_dim"][0] + assert ( + grid[0]["reduce_dim"][1].get_params() == deserialized[0]["reduce_dim"][1].get_params() ) - self.assertEqual(grid[0]["classify__C"], deserialized[0]["classify__C"]) - self.assertEqual( - 
grid[1]["reduce_dim"][0].get_params(), deserialized[1]["reduce_dim"][0].get_params() + assert grid[0]["reduce_dim"][1] is not deserialized[0]["reduce_dim"][1] + assert grid[0]["reduce_dim__n_components"] == deserialized[0]["reduce_dim__n_components"] + assert grid[0]["classify__C"] == deserialized[0]["classify__C"] + assert ( + grid[1]["reduce_dim"][0].get_params() == deserialized[1]["reduce_dim"][0].get_params() ) - self.assertIsNot(grid[1]["reduce_dim"][0], deserialized[1]["reduce_dim"][0]) - self.assertEqual(grid[1]["reduce_dim__k"], deserialized[1]["reduce_dim__k"]) - self.assertEqual(grid[1]["classify__C"], deserialized[1]["classify__C"]) + assert grid[1]["reduce_dim"][0] is not deserialized[1]["reduce_dim"][0] + assert grid[1]["reduce_dim__k"] == deserialized[1]["reduce_dim__k"] + assert grid[1]["classify__C"] == deserialized[1]["classify__C"] - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_advanced_grid_fails(self): # This unit test is checking that the test we skip above would actually fail @@ -1157,28 +1163,29 @@ def test_serialize_advanced_grid_fails(self): "base_estimator": [ sklearn.tree.DecisionTreeClassifier(), sklearn.tree.ExtraTreeClassifier(), - ] + ], } clf = sklearn.model_selection.GridSearchCV( sklearn.ensemble.BaggingClassifier(), param_grid=param_grid, ) - with self.assertRaisesRegex( - TypeError, re.compile(r".*OpenML.*Flow.*is not JSON serializable", flags=re.DOTALL) + with pytest.raises( + TypeError, + match=re.compile(r".*OpenML.*Flow.*is not JSON serializable", flags=re.DOTALL), ): self.extension.model_to_flow(clf) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_resampling(self): kfold = sklearn.model_selection.StratifiedKFold(n_splits=4, shuffle=True) serialized = self.extension.model_to_flow(kfold) deserialized = self.extension.flow_to_model(serialized) # Best approximation to get_params() - self.assertEqual(str(deserialized), str(kfold)) - self.assertIsNot(deserialized, kfold) + assert 
str(deserialized) == str(kfold) + assert deserialized is not kfold - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_hypothetical_parameter_values(self): # The hypothetical parameter values of true, 1, 0.1 formatted as a # string (and their correct serialization and deserialization) an only @@ -1189,21 +1196,21 @@ def test_hypothetical_parameter_values(self): serialized = self.extension.model_to_flow(model) serialized.external_version = "sklearn==test123" deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(deserialized.get_params(), model.get_params()) - self.assertIsNot(deserialized, model) + assert deserialized.get_params() == model.get_params() + assert deserialized is not model - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_gaussian_process(self): opt = scipy.optimize.fmin_l_bfgs_b kernel = sklearn.gaussian_process.kernels.Matern() gp = sklearn.gaussian_process.GaussianProcessClassifier(kernel=kernel, optimizer=opt) - with self.assertRaisesRegex( + with pytest.raises( TypeError, - r"Matern\(length_scale=1, nu=1.5\), ", + match=r"Matern\(length_scale=1, nu=1.5\), ", ): self.extension.model_to_flow(gp) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_error_on_adding_component_multiple_times_to_flow(self): # this function implicitly checks # - openml.flows._check_multiple_occurence_of_component_in_flow() @@ -1211,24 +1218,24 @@ def test_error_on_adding_component_multiple_times_to_flow(self): pca2 = sklearn.decomposition.PCA() pipeline = sklearn.pipeline.Pipeline((("pca1", pca), ("pca2", pca2))) fixture = "Found a second occurence of component .*.PCA when trying to serialize Pipeline" - with self.assertRaisesRegex(ValueError, fixture): + with pytest.raises(ValueError, match=fixture): self.extension.model_to_flow(pipeline) fu = sklearn.pipeline.FeatureUnion((("pca1", pca), ("pca2", pca2))) fixture = ( "Found a second occurence of component .*.PCA when trying " "to serialize FeatureUnion" ) - with 
self.assertRaisesRegex(ValueError, fixture): + with pytest.raises(ValueError, match=fixture): self.extension.model_to_flow(fu) fs = sklearn.feature_selection.SelectKBest() fu2 = sklearn.pipeline.FeatureUnion((("pca1", pca), ("fs", fs))) pipeline2 = sklearn.pipeline.Pipeline((("fu", fu2), ("pca2", pca2))) fixture = "Found a second occurence of component .*.PCA when trying to serialize Pipeline" - with self.assertRaisesRegex(ValueError, fixture): + with pytest.raises(ValueError, match=fixture): self.extension.model_to_flow(pipeline2) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_subflow_version_propagated(self): this_directory = os.path.dirname(os.path.abspath(__file__)) tests_directory = os.path.abspath(os.path.join(this_directory, "..", "..")) @@ -1243,44 +1250,40 @@ def test_subflow_version_propagated(self): # I put the alternative travis-ci answer here as well. While it has a # different value, it is still correct as it is a propagation of the # subclasses' module name - self.assertEqual( - flow.external_version, - "%s,%s,%s" - % ( - self.extension._format_external_version("openml", openml.__version__), - self.extension._format_external_version("sklearn", sklearn.__version__), - self.extension._format_external_version("tests", "0.1"), - ), + assert flow.external_version == "{},{},{}".format( + self.extension._format_external_version("openml", openml.__version__), + self.extension._format_external_version("sklearn", sklearn.__version__), + self.extension._format_external_version("tests", "0.1"), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @mock.patch("warnings.warn") def test_check_dependencies(self, warnings_mock): dependencies = ["sklearn==0.1", "sklearn>=99.99.99", "sklearn>99.99.99"] for dependency in dependencies: self.assertRaises(ValueError, self.extension._check_dependencies, dependency) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_illegal_parameter_names(self): # illegal name: estimators clf1 = 
sklearn.ensemble.VotingClassifier( estimators=[ ("estimators", sklearn.ensemble.RandomForestClassifier()), ("whatevs", sklearn.ensemble.ExtraTreesClassifier()), - ] + ], ) clf2 = sklearn.ensemble.VotingClassifier( estimators=[ ("whatevs", sklearn.ensemble.RandomForestClassifier()), ("estimators", sklearn.ensemble.ExtraTreesClassifier()), - ] + ], ) cases = [clf1, clf2] for case in cases: self.assertRaises(PyOpenMLError, self.extension.model_to_flow, case) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_paralizable_check(self): # using this model should pass the test (if param distribution is # legal) @@ -1297,18 +1300,19 @@ def test_paralizable_check(self): sklearn.ensemble.RandomForestClassifier(n_jobs=5), sklearn.ensemble.RandomForestClassifier(n_jobs=-1), sklearn.pipeline.Pipeline( - steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=1))] + steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=1))], ), sklearn.pipeline.Pipeline( - steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=5))] + steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=5))], ), sklearn.pipeline.Pipeline( - steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=-1))] + steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=-1))], ), sklearn.model_selection.GridSearchCV(singlecore_bagging, legal_param_dist), sklearn.model_selection.GridSearchCV(multicore_bagging, legal_param_dist), sklearn.ensemble.BaggingClassifier( - n_jobs=-1, base_estimator=sklearn.ensemble.RandomForestClassifier(n_jobs=5) + n_jobs=-1, + base_estimator=sklearn.ensemble.RandomForestClassifier(n_jobs=5), ), ] illegal_models = [ @@ -1324,13 +1328,13 @@ def test_paralizable_check(self): X, y = sklearn.datasets.load_iris(return_X_y=True) for model, refit_time in zip(legal_models, has_refit_time): model.fit(X, y) - self.assertEqual(refit_time, hasattr(model, "refit_time_")) + assert refit_time == hasattr(model, "refit_time_") for model in illegal_models: - with self.assertRaises(PyOpenMLError): + 
with pytest.raises(PyOpenMLError): self.extension._prevent_optimize_n_jobs(model) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test__get_fn_arguments_with_defaults(self): sklearn_version = LooseVersion(sklearn.__version__) if sklearn_version < "0.19": @@ -1379,16 +1383,16 @@ def test__get_fn_arguments_with_defaults(self): for fn, num_params_with_defaults in fns: defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn) - self.assertIsInstance(defaults, dict) - self.assertIsInstance(defaultless, set) + assert isinstance(defaults, dict) + assert isinstance(defaultless, set) # check whether we have both defaults and defaultless params - self.assertEqual(len(defaults), num_params_with_defaults) - self.assertGreater(len(defaultless), 0) + assert len(defaults) == num_params_with_defaults + assert len(defaultless) > 0 # check no overlap self.assertSetEqual(set(defaults.keys()), set(defaults.keys()) - defaultless) self.assertSetEqual(defaultless, defaultless - set(defaults.keys())) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_deserialize_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter @@ -1424,7 +1428,7 @@ def test_deserialize_with_defaults(self): self.extension.model_to_flow(pipe_deserialized), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_deserialize_adaboost_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter @@ -1463,7 +1467,7 @@ def test_deserialize_adaboost_with_defaults(self): self.extension.model_to_flow(pipe_deserialized), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_deserialize_complex_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter @@ -1475,8 +1479,8 @@ def test_deserialize_complex_with_defaults(self): 
"Estimator", sklearn.ensemble.AdaBoostClassifier( sklearn.ensemble.BaggingClassifier( - sklearn.ensemble.GradientBoostingClassifier() - ) + sklearn.ensemble.GradientBoostingClassifier(), + ), ), ), ] @@ -1507,11 +1511,11 @@ def test_deserialize_complex_with_defaults(self): self.extension.model_to_flow(pipe_deserialized), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_openml_param_name_to_sklearn(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier() + base_estimator=sklearn.tree.DecisionTreeClassifier(), ) model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("boosting", boosting)]) flow = self.extension.model_to_flow(model) @@ -1524,7 +1528,7 @@ def test_openml_param_name_to_sklearn(self): setup = openml.setups.get_setup(run.setup_id) # make sure to test enough parameters - self.assertGreater(len(setup.parameters), 15) + assert len(setup.parameters) > 15 for parameter in setup.parameters.values(): sklearn_name = self.extension._openml_param_name_to_sklearn(parameter, flow) @@ -1539,32 +1543,30 @@ def test_openml_param_name_to_sklearn(self): subflow = flow.get_subflow(splitted[0:-1]) else: subflow = flow - openml_name = "%s(%s)_%s" % (subflow.name, subflow.version, splitted[-1]) - self.assertEqual(parameter.full_name, openml_name) + openml_name = f"{subflow.name}({subflow.version})_{splitted[-1]}" + assert parameter.full_name == openml_name - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_obtain_parameter_values_flow_not_from_server(self): model = sklearn.linear_model.LogisticRegression(solver="lbfgs") flow = self.extension.model_to_flow(model) logistic_name = "logistic" if LooseVersion(sklearn.__version__) < "0.22" else "_logistic" - msg = "Flow sklearn.linear_model.{}.LogisticRegression has no flow_id!".format( - logistic_name - ) + msg = f"Flow sklearn.linear_model.{logistic_name}.LogisticRegression has no flow_id!" 
- with self.assertRaisesRegex(ValueError, msg): + with pytest.raises(ValueError, match=msg): self.extension.obtain_parameter_values(flow) model = sklearn.ensemble.AdaBoostClassifier( base_estimator=sklearn.linear_model.LogisticRegression( solver="lbfgs", - ) + ), ) flow = self.extension.model_to_flow(model) flow.flow_id = 1 - with self.assertRaisesRegex(ValueError, msg): + with pytest.raises(ValueError, match=msg): self.extension.obtain_parameter_values(flow) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_obtain_parameter_values(self): model = sklearn.model_selection.RandomizedSearchCV( estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5), @@ -1584,24 +1586,25 @@ def test_obtain_parameter_values(self): flow.components["estimator"].flow_id = 2 parameters = self.extension.obtain_parameter_values(flow) for parameter in parameters: - self.assertIsNotNone(parameter["oml:component"], msg=parameter) + assert parameter["oml:component"] is not None, parameter if parameter["oml:name"] == "n_estimators": - self.assertEqual(parameter["oml:value"], "5") - self.assertEqual(parameter["oml:component"], 2) + assert parameter["oml:value"] == "5" + assert parameter["oml:component"] == 2 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_numpy_type_allowed_in_flow(self): """Simple numpy types should be serializable.""" dt = sklearn.tree.DecisionTreeClassifier( - max_depth=np.float64(3.0), min_samples_leaf=np.int32(5) + max_depth=np.float64(3.0), + min_samples_leaf=np.int32(5), ) self.extension.model_to_flow(dt) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_numpy_array_not_allowed_in_flow(self): """Simple numpy arrays should not be serializable.""" bin = sklearn.preprocessing.MultiLabelBinarizer(classes=np.asarray([1, 2, 3])) - with self.assertRaises(TypeError): + with pytest.raises(TypeError): self.extension.model_to_flow(bin) @@ -1615,7 +1618,7 @@ def setUp(self): 
################################################################################################ # Test methods for performing runs with this extension module - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_model_on_task(self): task = openml.tasks.get_task(1) # anneal; crossvalidation # using most_frequent imputer since dataset has mixed types and to keep things simple @@ -1623,11 +1626,11 @@ def test_run_model_on_task(self): [ ("imp", SimpleImputer(strategy="most_frequent")), ("dummy", sklearn.dummy.DummyClassifier()), - ] + ], ) openml.runs.run_model_on_task(pipe, task, dataset_format="array") - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_seed_model(self): # randomized models that are initialized without seeds, can be seeded randomized_clfs = [ @@ -1650,11 +1653,11 @@ def test_seed_model(self): const_probe = 42 all_params = clf.get_params() params = [key for key in all_params if key.endswith("random_state")] - self.assertGreater(len(params), 0) + assert len(params) > 0 # before param value is None for param in params: - self.assertIsNone(all_params[param]) + assert all_params[param] is None # now seed the params clf_seeded = self.extension.seed_model(clf, const_probe) @@ -1664,13 +1667,13 @@ def test_seed_model(self): # afterwards, param value is set for param in randstate_params: - self.assertIsInstance(new_params[param], int) - self.assertIsNotNone(new_params[param]) + assert isinstance(new_params[param], int) + assert new_params[param] is not None if idx == 1: - self.assertEqual(clf.cv.random_state, 56422) + assert clf.cv.random_state == 56422 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_seed_model_raises(self): # the _set_model_seed_where_none should raise exception if random_state is # anything else than an int @@ -1680,10 +1683,10 @@ def test_seed_model_raises(self): ] for clf in randomized_clfs: - with self.assertRaises(ValueError): + with pytest.raises(ValueError): self.extension.seed_model(model=clf, seed=42) - 
@pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_model_on_fold_classification_1_array(self): task = openml.tasks.get_task(1) # anneal; crossvalidation @@ -1695,7 +1698,7 @@ def test_run_model_on_fold_classification_1_array(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeClassifier())] + steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeClassifier())], ) # TODO add some mocking here to actually test the innards of this function, too! res = self.extension._run_model_on_fold( @@ -1711,26 +1714,27 @@ def test_run_model_on_fold_classification_1_array(self): y_hat, y_hat_proba, user_defined_measures, trace = res # predictions - self.assertIsInstance(y_hat, np.ndarray) - self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsInstance(y_hat_proba, pd.DataFrame) - self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 6)) + assert isinstance(y_hat, np.ndarray) + assert y_hat.shape == y_test.shape + assert isinstance(y_hat_proba, pd.DataFrame) + assert y_hat_proba.shape == (y_test.shape[0], 6) np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) # The class '4' (at index 3) is not present in the training data. We check that the # predicted probabilities for that class are zero! 
np.testing.assert_array_almost_equal( - y_hat_proba.iloc[:, 3].to_numpy(), np.zeros(y_test.shape) + y_hat_proba.iloc[:, 3].to_numpy(), + np.zeros(y_test.shape), ) for i in (0, 1, 2, 4, 5): - self.assertTrue(np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))) + assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] # trace. SGD does not produce any - self.assertIsNone(trace) + assert trace is None self._check_fold_timing_evaluations( fold_evaluations, @@ -1740,7 +1744,7 @@ def test_run_model_on_fold_classification_1_array(self): check_scores=False, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="SimpleImputer, ColumnTransformer available only after 0.19 and " @@ -1767,7 +1771,7 @@ def test_run_model_on_fold_classification_1_dataframe(self): cont_imp = make_pipeline(CustomImputer(strategy="mean"), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) pipeline = sklearn.pipeline.Pipeline( - steps=[("transform", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + steps=[("transform", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], ) # TODO add some mocking here to actually test the innards of this function, too! 
res = self.extension._run_model_on_fold( @@ -1783,26 +1787,27 @@ def test_run_model_on_fold_classification_1_dataframe(self): y_hat, y_hat_proba, user_defined_measures, trace = res # predictions - self.assertIsInstance(y_hat, np.ndarray) - self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsInstance(y_hat_proba, pd.DataFrame) - self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 6)) + assert isinstance(y_hat, np.ndarray) + assert y_hat.shape == y_test.shape + assert isinstance(y_hat_proba, pd.DataFrame) + assert y_hat_proba.shape == (y_test.shape[0], 6) np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) # The class '4' (at index 3) is not present in the training data. We check that the # predicted probabilities for that class are zero! np.testing.assert_array_almost_equal( - y_hat_proba.iloc[:, 3].to_numpy(), np.zeros(y_test.shape) + y_hat_proba.iloc[:, 3].to_numpy(), + np.zeros(y_test.shape), ) for i in (0, 1, 2, 4, 5): - self.assertTrue(np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))) + assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] # trace. 
SGD does not produce any - self.assertIsNone(trace) + assert trace is None self._check_fold_timing_evaluations( fold_evaluations, @@ -1812,7 +1817,7 @@ def test_run_model_on_fold_classification_1_dataframe(self): check_scores=False, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_model_on_fold_classification_2(self): task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation @@ -1841,22 +1846,22 @@ def test_run_model_on_fold_classification_2(self): y_hat, y_hat_proba, user_defined_measures, trace = res # predictions - self.assertIsInstance(y_hat, np.ndarray) - self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsInstance(y_hat_proba, pd.DataFrame) - self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 2)) + assert isinstance(y_hat, np.ndarray) + assert y_hat.shape == y_test.shape + assert isinstance(y_hat_proba, pd.DataFrame) + assert y_hat_proba.shape == (y_test.shape[0], 2) np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) for i in (0, 1): - self.assertTrue(np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape))) + assert np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] # check that it produced and returned a trace object of the correct length - self.assertIsInstance(trace, OpenMLRunTrace) - self.assertEqual(len(trace.trace_iterations), 2) + assert isinstance(trace, OpenMLRunTrace) + assert len(trace.trace_iterations) == 2 self._check_fold_timing_evaluations( fold_evaluations, @@ -1866,7 +1871,7 @@ def test_run_model_on_fold_classification_2(self): check_scores=False, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def 
test_run_model_on_fold_classification_3(self): class HardNaiveBayes(sklearn.naive_bayes.GaussianNB): # class for testing a naive bayes classifier that does not allow soft @@ -1887,7 +1892,9 @@ def predict_proba(*args, **kwargs): task = openml.tasks.get_task(task_id) X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices( - repeat=0, fold=0, sample=0 + repeat=0, + fold=0, + sample=0, ) X_train = X[train_indices] y_train = y[train_indices] @@ -1896,10 +1903,10 @@ def predict_proba(*args, **kwargs): steps=[ ("imputer", SimpleImputer()), ("estimator", sklearn.naive_bayes.GaussianNB()), - ] + ], ) clf2 = sklearn.pipeline.Pipeline( - steps=[("imputer", SimpleImputer()), ("estimator", HardNaiveBayes())] + steps=[("imputer", SimpleImputer()), ("estimator", HardNaiveBayes())], ) pred_1, proba_1, _, _ = self.extension._run_model_on_fold( @@ -1925,19 +1932,18 @@ def predict_proba(*args, **kwargs): np.testing.assert_array_equal(pred_1, pred_2) np.testing.assert_array_almost_equal(np.sum(proba_1, axis=1), np.ones(X_test.shape[0])) # Test that there are predictions other than ones and zeros - self.assertLess( - np.sum(proba_1.to_numpy() == 0) + np.sum(proba_1.to_numpy() == 1), - X_test.shape[0] * len(task.class_labels), - ) + assert np.sum(proba_1.to_numpy() == 0) + np.sum(proba_1.to_numpy() == 1) < X_test.shape[ + 0 + ] * len(task.class_labels) np.testing.assert_array_almost_equal(np.sum(proba_2, axis=1), np.ones(X_test.shape[0])) # Test that there are only ones and zeros predicted - self.assertEqual( - np.sum(proba_2.to_numpy() == 0) + np.sum(proba_2.to_numpy() == 1), - X_test.shape[0] * len(task.class_labels), - ) + assert np.sum(proba_2.to_numpy() == 0) + np.sum( + proba_2.to_numpy() == 1 + ) == X_test.shape[0] * len(task.class_labels) - @pytest.mark.sklearn + @pytest.mark.sklearn() + @pytest.mark.production() def test_run_model_on_fold_regression(self): # There aren't any regression tasks on the test server openml.config.server = 
self.production_server @@ -1951,7 +1957,7 @@ def test_run_model_on_fold_regression(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeRegressor())] + steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeRegressor())], ) # TODO add some mocking here to actually test the innards of this function, too! res = self.extension._run_model_on_fold( @@ -1967,17 +1973,17 @@ def test_run_model_on_fold_regression(self): y_hat, y_hat_proba, user_defined_measures, trace = res # predictions - self.assertIsInstance(y_hat, np.ndarray) - self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsNone(y_hat_proba) + assert isinstance(y_hat, np.ndarray) + assert y_hat.shape == y_test.shape + assert y_hat_proba is None # check user defined measures - fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] # trace. SGD does not produce any - self.assertIsNone(trace) + assert trace is None self._check_fold_timing_evaluations( fold_evaluations, @@ -1987,7 +1993,8 @@ def test_run_model_on_fold_regression(self): check_scores=False, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() + @pytest.mark.production() def test_run_model_on_fold_clustering(self): # There aren't any regression tasks on the test server openml.config.server = self.production_server @@ -1996,7 +2003,7 @@ def test_run_model_on_fold_clustering(self): X = task.get_X(dataset_format="array") pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.cluster.KMeans())] + steps=[("imp", SimpleImputer()), ("clf", sklearn.cluster.KMeans())], ) # TODO add some mocking here to actually test the innards of this function, too! 
res = self.extension._run_model_on_fold( @@ -2010,17 +2017,17 @@ def test_run_model_on_fold_clustering(self): y_hat, y_hat_proba, user_defined_measures, trace = res # predictions - self.assertIsInstance(y_hat, np.ndarray) - self.assertEqual(y_hat.shape, (X.shape[0],)) - self.assertIsNone(y_hat_proba) + assert isinstance(y_hat, np.ndarray) + assert y_hat.shape == (X.shape[0],) + assert y_hat_proba is None # check user defined measures - fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] # trace. SGD does not produce any - self.assertIsNone(trace) + assert trace is None self._check_fold_timing_evaluations( fold_evaluations, @@ -2030,7 +2037,7 @@ def test_run_model_on_fold_clustering(self): check_scores=False, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test__extract_trace_data(self): param_grid = { "hidden_layer_sizes": [[5, 5], [10, 10], [20, 20]], @@ -2053,34 +2060,34 @@ def test__extract_trace_data(self): clf.fit(X[train], y[train]) # check num layers of MLP - self.assertIn(clf.best_estimator_.hidden_layer_sizes, param_grid["hidden_layer_sizes"]) + assert clf.best_estimator_.hidden_layer_sizes in param_grid["hidden_layer_sizes"] trace_list = self.extension._extract_trace_data(clf, rep_no=0, fold_no=0) trace = self.extension._obtain_arff_trace(clf, trace_list) - self.assertIsInstance(trace, OpenMLRunTrace) - self.assertIsInstance(trace_list, list) - self.assertEqual(len(trace_list), num_iters) + assert isinstance(trace, OpenMLRunTrace) + assert isinstance(trace_list, list) + assert len(trace_list) == num_iters for trace_iteration in iter(trace): - self.assertEqual(trace_iteration.repeat, 0) - self.assertEqual(trace_iteration.fold, 0) - self.assertGreaterEqual(trace_iteration.iteration, 0) - 
self.assertLessEqual(trace_iteration.iteration, num_iters) - self.assertIsNone(trace_iteration.setup_string) - self.assertIsInstance(trace_iteration.evaluation, float) - self.assertTrue(np.isfinite(trace_iteration.evaluation)) - self.assertIsInstance(trace_iteration.selected, bool) - - self.assertEqual(len(trace_iteration.parameters), len(param_grid)) + assert trace_iteration.repeat == 0 + assert trace_iteration.fold == 0 + assert trace_iteration.iteration >= 0 + assert trace_iteration.iteration <= num_iters + assert trace_iteration.setup_string is None + assert isinstance(trace_iteration.evaluation, float) + assert np.isfinite(trace_iteration.evaluation) + assert isinstance(trace_iteration.selected, bool) + + assert len(trace_iteration.parameters) == len(param_grid) for param in param_grid: # Prepend with the "parameter_" prefix param_in_trace = "parameter_%s" % param - self.assertIn(param_in_trace, trace_iteration.parameters) + assert param_in_trace in trace_iteration.parameters param_value = json.loads(trace_iteration.parameters[param_in_trace]) - self.assertTrue(param_value in param_grid[param]) + assert param_value in param_grid[param] - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_trim_flow_name(self): import re @@ -2097,10 +2104,8 @@ def test_trim_flow_name(self): short = "sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)" shorter = "sklearn.Pipeline(...,SVC)" long_stripped, _ = re.subn(r"\s", "", long) - self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) - self.assertEqual( - shorter, SklearnExtension.trim_flow_name(long_stripped, extra_trim_length=50) - ) + assert short == SklearnExtension.trim_flow_name(long_stripped) + assert shorter == SklearnExtension.trim_flow_name(long_stripped, extra_trim_length=50) long = """sklearn.pipeline.Pipeline( imputation=openmlstudy14.preprocessing.ConditionalImputer, @@ -2109,16 +2114,18 @@ def test_trim_flow_name(self): classifier=sklearn.ensemble.forest.RandomForestClassifier)""" 
short = "sklearn.Pipeline(ConditionalImputer,OneHotEncoder,VarianceThreshold,RandomForestClassifier)" # noqa: E501 long_stripped, _ = re.subn(r"\s", "", long) - self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + assert short == SklearnExtension.trim_flow_name(long_stripped) long = """sklearn.pipeline.Pipeline( SimpleImputer=sklearn.preprocessing.imputation.Imputer, VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501 Estimator=sklearn.model_selection._search.RandomizedSearchCV( estimator=sklearn.tree.tree.DecisionTreeClassifier))""" - short = "sklearn.Pipeline(Imputer,VarianceThreshold,RandomizedSearchCV(DecisionTreeClassifier))" # noqa: E501 + short = ( + "sklearn.Pipeline(Imputer,VarianceThreshold,RandomizedSearchCV(DecisionTreeClassifier))" + ) long_stripped, _ = re.subn(r"\s", "", long) - self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + assert short == SklearnExtension.trim_flow_name(long_stripped) long = """sklearn.model_selection._search.RandomizedSearchCV( estimator=sklearn.pipeline.Pipeline( @@ -2126,24 +2133,22 @@ def test_trim_flow_name(self): classifier=sklearn.ensemble.forest.RandomForestClassifier))""" short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))" long_stripped, _ = re.subn(r"\s", "", long) - self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + assert short == SklearnExtension.trim_flow_name(long_stripped) long = """sklearn.pipeline.FeatureUnion( pca=sklearn.decomposition.pca.PCA, svd=sklearn.decomposition.truncated_svd.TruncatedSVD)""" short = "sklearn.FeatureUnion(PCA,TruncatedSVD)" long_stripped, _ = re.subn(r"\s", "", long) - self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + assert short == SklearnExtension.trim_flow_name(long_stripped) long = "sklearn.ensemble.forest.RandomForestClassifier" short = "sklearn.RandomForestClassifier" - self.assertEqual(short, 
SklearnExtension.trim_flow_name(long)) + assert short == SklearnExtension.trim_flow_name(long) - self.assertEqual( - "weka.IsolationForest", SklearnExtension.trim_flow_name("weka.IsolationForest") - ) + assert SklearnExtension.trim_flow_name("weka.IsolationForest") == "weka.IsolationForest" - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="SimpleImputer, ColumnTransformer available only after 0.19 and " @@ -2157,7 +2162,8 @@ def test_run_on_model_with_empty_steps(self): task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation X, y, categorical_ind, feature_names = dataset.get_data( - target=dataset.default_target_attribute, dataset_format="array" + target=dataset.default_target_attribute, + dataset_format="array", ) categorical_ind = np.array(categorical_ind) (cat_idx,) = np.where(categorical_ind) @@ -2176,8 +2182,8 @@ def test_run_on_model_with_empty_steps(self): make_pipeline(SimpleImputer(strategy="median"), StandardScaler()), cont_idx.tolist(), ), - ] - ) + ], + ), ) clf = sklearn.pipeline.Pipeline( @@ -2185,7 +2191,7 @@ def test_run_on_model_with_empty_steps(self): ("dummystep", "passthrough"), # adding 'passthrough' as an estimator ("prep", clf), ("classifier", sklearn.svm.SVC(gamma="auto")), - ] + ], ) # adding 'drop' to a ColumnTransformer @@ -2197,43 +2203,42 @@ def test_run_on_model_with_empty_steps(self): # serializing model with non-actionable step run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True) - self.assertEqual(len(flow.components), 3) - self.assertIsInstance(flow.components["dummystep"], OpenMLFlow) - self.assertEqual(flow.components["dummystep"].name, "passthrough") - self.assertIsInstance(flow.components["classifier"], OpenMLFlow) + assert len(flow.components) == 3 + assert isinstance(flow.components["dummystep"], OpenMLFlow) + assert flow.components["dummystep"].name == "passthrough" + assert isinstance(flow.components["classifier"], 
OpenMLFlow) if LooseVersion(sklearn.__version__) < "0.22": - self.assertEqual(flow.components["classifier"].name, "sklearn.svm.classes.SVC") + assert flow.components["classifier"].name == "sklearn.svm.classes.SVC" else: - self.assertEqual(flow.components["classifier"].name, "sklearn.svm._classes.SVC") - self.assertIsInstance(flow.components["prep"], OpenMLFlow) - self.assertEqual(flow.components["prep"].class_name, "sklearn.pipeline.Pipeline") - self.assertIsInstance(flow.components["prep"].components["columntransformer"], OpenMLFlow) - self.assertIsInstance( - flow.components["prep"].components["columntransformer"].components["cat"], - OpenMLFlow, + assert flow.components["classifier"].name == "sklearn.svm._classes.SVC" + assert isinstance(flow.components["prep"], OpenMLFlow) + assert flow.components["prep"].class_name == "sklearn.pipeline.Pipeline" + assert isinstance(flow.components["prep"].components["columntransformer"], OpenMLFlow) + assert isinstance( + flow.components["prep"].components["columntransformer"].components["cat"], OpenMLFlow ) - self.assertEqual( - flow.components["prep"].components["columntransformer"].components["cat"].name, "drop" + assert ( + flow.components["prep"].components["columntransformer"].components["cat"].name == "drop" ) # de-serializing flow to a model with non-actionable step model = self.extension.flow_to_model(flow) model.fit(X, y) - self.assertEqual(type(model), type(clf)) - self.assertNotEqual(model, clf) - self.assertEqual(len(model.named_steps), 3) - self.assertEqual(model.named_steps["dummystep"], "passthrough") + assert type(model) == type(clf) + assert model != clf + assert len(model.named_steps) == 3 + assert model.named_steps["dummystep"] == "passthrough" xml = flow._to_dict() new_model = self.extension.flow_to_model(OpenMLFlow._from_dict(xml)) new_model.fit(X, y) - self.assertEqual(type(new_model), type(clf)) - self.assertNotEqual(new_model, clf) - self.assertEqual(len(new_model.named_steps), 3) - 
self.assertEqual(new_model.named_steps["dummystep"], "passthrough") + assert type(new_model) == type(clf) + assert new_model != clf + assert len(new_model.named_steps) == 3 + assert new_model.named_steps["dummystep"] == "passthrough" - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_sklearn_serialization_with_none_step(self): msg = ( "Cannot serialize objects of None type. Please use a valid " @@ -2241,12 +2246,12 @@ def test_sklearn_serialization_with_none_step(self): "replaced with 'drop' or 'passthrough'." ) clf = sklearn.pipeline.Pipeline( - [("dummystep", None), ("classifier", sklearn.svm.SVC(gamma="auto"))] + [("dummystep", None), ("classifier", sklearn.svm.SVC(gamma="auto"))], ) - with self.assertRaisesRegex(ValueError, msg): + with pytest.raises(ValueError, match=msg): self.extension.model_to_flow(clf) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -2260,17 +2265,18 @@ def test_failed_serialization_of_custom_class(self): from sklearn.preprocessing import Imputer as SimpleImputer import sklearn.tree - from sklearn.pipeline import Pipeline, make_pipeline from sklearn.compose import ColumnTransformer + from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore"), ) cont_imp = make_pipeline(CustomImputer(), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) clf = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], ) # build a sklearn classifier task = openml.tasks.get_task(253) # profb; crossvalidation @@ -2282,7 +2288,7 @@ def 
test_failed_serialization_of_custom_class(self): else: raise Exception(e) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -2301,7 +2307,7 @@ def column_transformer_pipe(task_id): transformers=[ ("num", StandardScaler(), cont), ("cat", OneHotEncoder(handle_unknown="ignore"), cat), - ] + ], ) # make pipeline clf = SVC(gamma="scale", random_state=1) @@ -2309,11 +2315,10 @@ def column_transformer_pipe(task_id): # run task run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False) run.publish() - new_run = openml.runs.get_run(run.run_id) - return new_run + return openml.runs.get_run(run.run_id) run1 = column_transformer_pipe(11) # only categorical TestBase._mark_entity_for_removal("run", run1.run_id) run2 = column_transformer_pipe(23) # only numeric TestBase._mark_entity_for_removal("run", run2.run_id) - self.assertEqual(run1.setup_id, run2.setup_id) + assert run1.setup_id == run2.setup_id diff --git a/tests/test_flows/dummy_learn/dummy_forest.py b/tests/test_flows/dummy_learn/dummy_forest.py index 613f73852..65e79e760 100644 --- a/tests/test_flows/dummy_learn/dummy_forest.py +++ b/tests/test_flows/dummy_learn/dummy_forest.py @@ -1,7 +1,8 @@ # License: BSD 3-Clause +from __future__ import annotations -class DummyRegressor(object): +class DummyRegressor: def fit(self, X, y): return self diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 983ea206d..afa31ef63 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -1,14 +1,15 @@ # License: BSD 3-Clause +from __future__ import annotations import collections import copy -from distutils.version import LooseVersion import hashlib import re import time +from distutils.version import LooseVersion from unittest import mock -import pytest +import pytest import scipy.stats import sklearn import sklearn.datasets @@ -17,19 +18,18 @@ import 
sklearn.ensemble import sklearn.feature_selection import sklearn.model_selection +import sklearn.naive_bayes import sklearn.pipeline import sklearn.preprocessing -import sklearn.naive_bayes import sklearn.tree - import xmltodict import openml -from openml._api_calls import _perform_api_call import openml.exceptions import openml.extensions.sklearn -from openml.testing import TestBase, SimpleImputer import openml.utils +from openml._api_calls import _perform_api_call +from openml.testing import SimpleImputer, TestBase class TestFlow(TestBase): @@ -42,38 +42,40 @@ def setUp(self): def tearDown(self): super().tearDown() + @pytest.mark.production() def test_get_flow(self): # We need to use the production server here because 4024 is not the # test server openml.config.server = self.production_server flow = openml.flows.get_flow(4024) - self.assertIsInstance(flow, openml.OpenMLFlow) - self.assertEqual(flow.flow_id, 4024) - self.assertEqual(len(flow.parameters), 24) - self.assertEqual(len(flow.components), 1) - - subflow_1 = list(flow.components.values())[0] - self.assertIsInstance(subflow_1, openml.OpenMLFlow) - self.assertEqual(subflow_1.flow_id, 4025) - self.assertEqual(len(subflow_1.parameters), 14) - self.assertEqual(subflow_1.parameters["E"], "CC") - self.assertEqual(len(subflow_1.components), 1) - - subflow_2 = list(subflow_1.components.values())[0] - self.assertIsInstance(subflow_2, openml.OpenMLFlow) - self.assertEqual(subflow_2.flow_id, 4026) - self.assertEqual(len(subflow_2.parameters), 13) - self.assertEqual(subflow_2.parameters["I"], "10") - self.assertEqual(len(subflow_2.components), 1) - - subflow_3 = list(subflow_2.components.values())[0] - self.assertIsInstance(subflow_3, openml.OpenMLFlow) - self.assertEqual(subflow_3.flow_id, 1724) - self.assertEqual(len(subflow_3.parameters), 11) - self.assertEqual(subflow_3.parameters["L"], "-1") - self.assertEqual(len(subflow_3.components), 0) - + assert isinstance(flow, openml.OpenMLFlow) + assert flow.flow_id == 
4024 + assert len(flow.parameters) == 24 + assert len(flow.components) == 1 + + subflow_1 = next(iter(flow.components.values())) + assert isinstance(subflow_1, openml.OpenMLFlow) + assert subflow_1.flow_id == 4025 + assert len(subflow_1.parameters) == 14 + assert subflow_1.parameters["E"] == "CC" + assert len(subflow_1.components) == 1 + + subflow_2 = next(iter(subflow_1.components.values())) + assert isinstance(subflow_2, openml.OpenMLFlow) + assert subflow_2.flow_id == 4026 + assert len(subflow_2.parameters) == 13 + assert subflow_2.parameters["I"] == "10" + assert len(subflow_2.components) == 1 + + subflow_3 = next(iter(subflow_2.components.values())) + assert isinstance(subflow_3, openml.OpenMLFlow) + assert subflow_3.flow_id == 1724 + assert len(subflow_3.parameters) == 11 + assert subflow_3.parameters["L"] == "-1" + assert len(subflow_3.components) == 0 + + @pytest.mark.production() def test_get_structure(self): # also responsible for testing: flow.get_subflow # We need to use the production server here because 4024 is not the @@ -85,33 +87,35 @@ def test_get_structure(self): flow_structure_id = flow.get_structure("flow_id") # components: root (filteredclassifier), multisearch, loginboost, # reptree - self.assertEqual(len(flow_structure_name), 4) - self.assertEqual(len(flow_structure_id), 4) + assert len(flow_structure_name) == 4 + assert len(flow_structure_id) == 4 for sub_flow_name, structure in flow_structure_name.items(): if len(structure) > 0: # skip root element subflow = flow.get_subflow(structure) - self.assertEqual(subflow.name, sub_flow_name) + assert subflow.name == sub_flow_name for sub_flow_id, structure in flow_structure_id.items(): if len(structure) > 0: # skip root element subflow = flow.get_subflow(structure) - self.assertEqual(subflow.flow_id, sub_flow_id) + assert subflow.flow_id == sub_flow_id def test_tagging(self): flows = openml.flows.list_flows(size=1, output_format="dataframe") flow_id = flows["id"].iloc[0] flow = 
openml.flows.get_flow(flow_id) - tag = "testing_tag_{}_{}".format(self.id(), time.time()) + # tags can be at most 64 alphanumeric (+ underscore) chars + unique_indicator = str(time.time()).replace(".", "") + tag = f"test_tag_TestFlow_{unique_indicator}" flows = openml.flows.list_flows(tag=tag, output_format="dataframe") - self.assertEqual(len(flows), 0) + assert len(flows) == 0 flow.push_tag(tag) flows = openml.flows.list_flows(tag=tag, output_format="dataframe") - self.assertEqual(len(flows), 1) - self.assertIn(flow_id, flows["id"]) + assert len(flows) == 1 + assert flow_id in flows["id"] flow.remove_tag(tag) flows = openml.flows.list_flows(tag=tag, output_format="dataframe") - self.assertEqual(len(flows), 0) + assert len(flows) == 0 def test_from_xml_to_xml(self): # Get the raw xml thing @@ -147,13 +151,13 @@ def test_from_xml_to_xml(self): ) new_xml = re.sub(r"^$", "", new_xml) - self.assertEqual(new_xml, flow_xml) + assert new_xml == flow_xml - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_to_xml_from_xml(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier() + base_estimator=sklearn.tree.DecisionTreeClassifier(), ) model = sklearn.pipeline.Pipeline(steps=(("scaler", scaler), ("boosting", boosting))) flow = self.extension.model_to_flow(model) @@ -166,9 +170,9 @@ def test_to_xml_from_xml(self): # Would raise exception if they are not legal openml.flows.functions.assert_flows_equal(new_flow, flow) - self.assertIsNot(new_flow, flow) + assert new_flow is not flow - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_publish_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", @@ -190,70 +194,65 @@ def test_publish_flow(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", 
flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) - self.assertIsInstance(flow.flow_id, int) + assert isinstance(flow.flow_id, int) - @pytest.mark.sklearn + @pytest.mark.sklearn() @mock.patch("openml.flows.functions.flow_exists") def test_publish_existing_flow(self, flow_exists_mock): clf = sklearn.tree.DecisionTreeClassifier(max_depth=2) flow = self.extension.model_to_flow(clf) flow_exists_mock.return_value = 1 - with self.assertRaises(openml.exceptions.PyOpenMLError) as context_manager: + with pytest.raises(openml.exceptions.PyOpenMLError, match="OpenMLFlow already exists"): flow.publish(raise_error_if_exists=True) - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) - TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id) - ) - self.assertTrue("OpenMLFlow already exists" in context_manager.exception.message) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id), + ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_publish_flow_with_similar_components(self): clf = sklearn.ensemble.VotingClassifier( - [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))] + [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))], ) flow = self.extension.model_to_flow(clf) flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) # For a flow where both components are published together, the upload # date should be equal - self.assertEqual( - flow.upload_date, - flow.components["lr"].upload_date, - msg=( - flow.name, - flow.flow_id, - flow.components["lr"].name, - 
flow.components["lr"].flow_id, - ), + assert flow.upload_date == flow.components["lr"].upload_date, ( + flow.name, + flow.flow_id, + flow.components["lr"].name, + flow.components["lr"].flow_id, ) clf1 = sklearn.tree.DecisionTreeClassifier(max_depth=2) flow1 = self.extension.model_to_flow(clf1) flow1, sentinel = self._add_sentinel_to_flow_name(flow1, None) flow1.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow1.flow_id)) # In order to assign different upload times to the flows! time.sleep(1) clf2 = sklearn.ensemble.VotingClassifier( - [("dt", sklearn.tree.DecisionTreeClassifier(max_depth=2))] + [("dt", sklearn.tree.DecisionTreeClassifier(max_depth=2))], ) flow2 = self.extension.model_to_flow(clf2) flow2, _ = self._add_sentinel_to_flow_name(flow2, sentinel) flow2.publish() - TestBase._mark_entity_for_removal("flow", (flow2.flow_id, flow2.name)) + TestBase._mark_entity_for_removal("flow", flow2.flow_id, flow2.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow2.flow_id)) # If one component was published before the other, the components in # the flow should have different upload dates - self.assertNotEqual(flow2.upload_date, flow2.components["dt"].upload_date) + assert flow2.upload_date != flow2.components["dt"].upload_date clf3 = sklearn.ensemble.AdaBoostClassifier(sklearn.tree.DecisionTreeClassifier(max_depth=3)) flow3 = self.extension.model_to_flow(clf3) @@ -261,27 +260,27 @@ def test_publish_flow_with_similar_components(self): # Child flow has different parameter. Check for storing the flow # correctly on the server should thus not check the child's parameters! 
flow3.publish() - TestBase._mark_entity_for_removal("flow", (flow3.flow_id, flow3.name)) + TestBase._mark_entity_for_removal("flow", flow3.flow_id, flow3.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow3.flow_id)) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! # should not throw error as it contains two differentiable forms of # Bagging i.e., Bagging(Bagging(J48)) and Bagging(J48) semi_legal = sklearn.ensemble.BaggingClassifier( base_estimator=sklearn.ensemble.BaggingClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier() - ) + base_estimator=sklearn.tree.DecisionTreeClassifier(), + ), ) flow = self.extension.model_to_flow(semi_legal) flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) - @pytest.mark.sklearn + @pytest.mark.sklearn() @mock.patch("openml.flows.functions.get_flow") @mock.patch("openml.flows.functions.flow_exists") @mock.patch("openml._api_calls._perform_api_call") @@ -297,22 +296,15 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): flow.publish() # Not collecting flow_id for deletion since this is a test for failed upload - self.assertEqual(api_call_mock.call_count, 1) - self.assertEqual(get_flow_mock.call_count, 1) - self.assertEqual(flow_exists_mock.call_count, 1) + assert api_call_mock.call_count == 1 + assert get_flow_mock.call_count == 1 + assert flow_exists_mock.call_count == 1 flow_copy = copy.deepcopy(flow) flow_copy.name = flow_copy.name[:-1] get_flow_mock.return_value = flow_copy flow_exists_mock.return_value = 1 - with self.assertRaises(ValueError) as context_manager: - flow.publish() - TestBase._mark_entity_for_removal("flow", 
(flow.flow_id, flow.name)) - TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id) - ) - if LooseVersion(sklearn.__version__) < "0.22": fixture = ( "The flow on the server is inconsistent with the local flow. " @@ -334,11 +326,17 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): "'sklearn.ensemble._forest.RandomForestClassifier'" "\nvs\n'sklearn.ensemble._forest.RandomForestClassifie'.'" ) + with pytest.raises(ValueError, match=fixture): + flow.publish() + + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id), + ) - self.assertEqual(context_manager.exception.args[0], fixture) - self.assertEqual(get_flow_mock.call_count, 2) + assert get_flow_mock.call_count == 2 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_illegal_flow(self): # should throw error as it contains two imputers illegal = sklearn.pipeline.Pipeline( @@ -346,7 +344,7 @@ def test_illegal_flow(self): ("imputer1", SimpleImputer()), ("imputer2", SimpleImputer()), ("classif", sklearn.tree.DecisionTreeClassifier()), - ] + ], ) self.assertRaises(ValueError, self.extension.model_to_flow, illegal) @@ -358,16 +356,15 @@ def get_sentinel(): md5 = hashlib.md5() md5.update(str(time.time()).encode("utf-8")) sentinel = md5.hexdigest()[:10] - sentinel = "TEST%s" % sentinel - return sentinel + return "TEST%s" % sentinel name = get_sentinel() + get_sentinel() version = get_sentinel() flow_id = openml.flows.flow_exists(name, version) - self.assertFalse(flow_id) + assert not flow_id - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_existing_flow_exists(self): # create a flow nb = sklearn.naive_bayes.GaussianNB() @@ -391,9 +388,9 @@ def test_existing_flow_exists(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) # publish the flow flow = flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, 
flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id) + "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id), ) # redownload the flow flow = openml.flows.get_flow(flow.flow_id) @@ -404,9 +401,9 @@ def test_existing_flow_exists(self): flow.name, flow.external_version, ) - self.assertEqual(downloaded_flow_id, flow.flow_id) + assert downloaded_flow_id == flow.flow_id - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_sklearn_to_upload_to_flow(self): iris = sklearn.datasets.load_iris() X = iris.data @@ -420,14 +417,15 @@ def test_sklearn_to_upload_to_flow(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) pca = sklearn.decomposition.TruncatedSVD() fs = sklearn.feature_selection.SelectPercentile( - score_func=sklearn.feature_selection.f_classif, percentile=30 + score_func=sklearn.feature_selection.f_classif, + percentile=30, ) fu = sklearn.pipeline.FeatureUnion(transformer_list=[("pca", pca), ("fs", fs)]) boosting = sklearn.ensemble.AdaBoostClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier() + base_estimator=sklearn.tree.DecisionTreeClassifier(), ) model = sklearn.pipeline.Pipeline( - steps=[("ohe", ohe), ("scaler", scaler), ("fu", fu), ("boosting", boosting)] + steps=[("ohe", ohe), ("scaler", scaler), ("fu", fu), ("boosting", boosting)], ) parameter_grid = { "boosting__n_estimators": [1, 5, 10, 100], @@ -436,7 +434,9 @@ def test_sklearn_to_upload_to_flow(self): } cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True) rs = sklearn.model_selection.RandomizedSearchCV( - estimator=model, param_distributions=parameter_grid, cv=cv + estimator=model, + param_distributions=parameter_grid, + cv=cv, ) rs.fit(X, y) flow = self.extension.model_to_flow(rs) @@ -451,9 +451,9 @@ def test_sklearn_to_upload_to_flow(self): flow, sentinel = self._add_sentinel_to_flow_name(flow, None) flow.publish() 
- TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) - self.assertIsInstance(flow.flow_id, int) + assert isinstance(flow.flow_id, int) # Check whether we can load the flow again # Remove the sentinel from the name again so that we can reinstantiate @@ -463,7 +463,7 @@ def test_sklearn_to_upload_to_flow(self): local_xml = flow._to_xml() server_xml = new_flow._to_xml() - for i in range(10): + for _i in range(10): # Make sure that we replace all occurences of two newlines local_xml = local_xml.replace(sentinel, "") local_xml = ( @@ -484,19 +484,19 @@ def test_sklearn_to_upload_to_flow(self): ) server_xml = re.sub(r"^$", "", server_xml) - self.assertEqual(server_xml, local_xml) + assert server_xml == local_xml # Would raise exception if they are not equal! openml.flows.functions.assert_flows_equal(new_flow, flow) - self.assertIsNot(new_flow, flow) + assert new_flow is not flow # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" if LooseVersion(sklearn.__version__) < "0.22": fixture_name = ( - "%ssklearn.model_selection._search.RandomizedSearchCV(" + f"{sentinel}sklearn.model_selection._search.RandomizedSearchCV(" "estimator=sklearn.pipeline.Pipeline(" - "ohe=sklearn.preprocessing.%s.OneHotEncoder," + f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," "scaler=sklearn.preprocessing.data.StandardScaler," "fu=sklearn.pipeline.FeatureUnion(" "pca=sklearn.decomposition.truncated_svd.TruncatedSVD," @@ -504,7 +504,6 @@ def test_sklearn_to_upload_to_flow(self): "sklearn.feature_selection.univariate_selection.SelectPercentile)," "boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier(" "base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))" - % (sentinel, module_name_encoder) ) else: 
# sklearn.sklearn.preprocessing.data -> sklearn.sklearn.preprocessing._data @@ -514,9 +513,9 @@ def test_sklearn_to_upload_to_flow(self): # sklearn.ensemble.weight_boosting -> sklearn.ensemble._weight_boosting # sklearn.tree.tree.DecisionTree... -> sklearn.tree._classes.DecisionTree... fixture_name = ( - "%ssklearn.model_selection._search.RandomizedSearchCV(" + f"{sentinel}sklearn.model_selection._search.RandomizedSearchCV(" "estimator=sklearn.pipeline.Pipeline(" - "ohe=sklearn.preprocessing.%s.OneHotEncoder," + f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," "scaler=sklearn.preprocessing._data.StandardScaler," "fu=sklearn.pipeline.FeatureUnion(" "pca=sklearn.decomposition._truncated_svd.TruncatedSVD," @@ -524,44 +523,44 @@ def test_sklearn_to_upload_to_flow(self): "sklearn.feature_selection._univariate_selection.SelectPercentile)," "boosting=sklearn.ensemble._weight_boosting.AdaBoostClassifier(" "base_estimator=sklearn.tree._classes.DecisionTreeClassifier)))" - % (sentinel, module_name_encoder) ) - self.assertEqual(new_flow.name, fixture_name) + assert new_flow.name == fixture_name new_flow.model.fit(X, y) def test_extract_tags(self): flow_xml = "study_14" flow_dict = xmltodict.parse(flow_xml) tags = openml.utils.extract_xml_tags("oml:tag", flow_dict) - self.assertEqual(tags, ["study_14"]) + assert tags == ["study_14"] flow_xml = "OpenmlWeka\n" "weka" flow_dict = xmltodict.parse(flow_xml) tags = openml.utils.extract_xml_tags("oml:tag", flow_dict["oml:flow"]) - self.assertEqual(tags, ["OpenmlWeka", "weka"]) + assert tags == ["OpenmlWeka", "weka"] + @pytest.mark.production() def test_download_non_scikit_learn_flows(self): openml.config.server = self.production_server flow = openml.flows.get_flow(6742) - self.assertIsInstance(flow, openml.OpenMLFlow) - self.assertEqual(flow.flow_id, 6742) - self.assertEqual(len(flow.parameters), 19) - self.assertEqual(len(flow.components), 1) - self.assertIsNone(flow.model) - - subflow_1 = 
list(flow.components.values())[0] - self.assertIsInstance(subflow_1, openml.OpenMLFlow) - self.assertEqual(subflow_1.flow_id, 6743) - self.assertEqual(len(subflow_1.parameters), 8) - self.assertEqual(subflow_1.parameters["U"], "0") - self.assertEqual(len(subflow_1.components), 1) - self.assertIsNone(subflow_1.model) - - subflow_2 = list(subflow_1.components.values())[0] - self.assertIsInstance(subflow_2, openml.OpenMLFlow) - self.assertEqual(subflow_2.flow_id, 5888) - self.assertEqual(len(subflow_2.parameters), 4) - self.assertIsNone(subflow_2.parameters["batch-size"]) - self.assertEqual(len(subflow_2.components), 0) - self.assertIsNone(subflow_2.model) + assert isinstance(flow, openml.OpenMLFlow) + assert flow.flow_id == 6742 + assert len(flow.parameters) == 19 + assert len(flow.components) == 1 + assert flow.model is None + + subflow_1 = next(iter(flow.components.values())) + assert isinstance(subflow_1, openml.OpenMLFlow) + assert subflow_1.flow_id == 6743 + assert len(subflow_1.parameters) == 8 + assert subflow_1.parameters["U"] == "0" + assert len(subflow_1.components) == 1 + assert subflow_1.model is None + + subflow_2 = next(iter(subflow_1.components.values())) + assert isinstance(subflow_2, openml.OpenMLFlow) + assert subflow_2.flow_id == 5888 + assert len(subflow_2.parameters) == 4 + assert subflow_2.parameters["batch-size"] is None + assert len(subflow_2.components) == 0 + assert subflow_2.model is None diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 3814a8f9d..68d49eafa 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -1,24 +1,24 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import OrderedDict import copy import functools import unittest +from collections import OrderedDict +from distutils.version import LooseVersion from unittest import mock from unittest.mock import patch -from distutils.version import 
LooseVersion - +import pandas as pd +import pytest import requests import sklearn from sklearn import ensemble -import pandas as pd -import pytest import openml +import openml.extensions.sklearn from openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerException from openml.testing import TestBase, create_request_response -import openml.extensions.sklearn @pytest.mark.usefixtures("long_version") @@ -26,61 +26,66 @@ class TestFlowFunctions(TestBase): _multiprocess_can_split_ = True def setUp(self): - super(TestFlowFunctions, self).setUp() + super().setUp() def tearDown(self): - super(TestFlowFunctions, self).tearDown() + super().tearDown() def _check_flow(self, flow): - self.assertEqual(type(flow), dict) - self.assertEqual(len(flow), 6) - self.assertIsInstance(flow["id"], int) - self.assertIsInstance(flow["name"], str) - self.assertIsInstance(flow["full_name"], str) - self.assertIsInstance(flow["version"], str) + assert type(flow) == dict + assert len(flow) == 6 + assert isinstance(flow["id"], int) + assert isinstance(flow["name"], str) + assert isinstance(flow["full_name"], str) + assert isinstance(flow["version"], str) # There are some runs on openml.org that can have an empty external version ext_version_str_or_none = ( isinstance(flow["external_version"], str) or flow["external_version"] is None ) - self.assertTrue(ext_version_str_or_none) + assert ext_version_str_or_none + @pytest.mark.production() def test_list_flows(self): openml.config.server = self.production_server # We can only perform a smoke test here because we test on dynamic # data from the internet... 
flows = openml.flows.list_flows(output_format="dataframe") # 3000 as the number of flows on openml.org - self.assertGreaterEqual(len(flows), 1500) + assert len(flows) >= 1500 for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) + @pytest.mark.production() def test_list_flows_output_format(self): openml.config.server = self.production_server # We can only perform a smoke test here because we test on dynamic # data from the internet... flows = openml.flows.list_flows(output_format="dataframe") - self.assertIsInstance(flows, pd.DataFrame) - self.assertGreaterEqual(len(flows), 1500) + assert isinstance(flows, pd.DataFrame) + assert len(flows) >= 1500 + @pytest.mark.production() def test_list_flows_empty(self): openml.config.server = self.production_server flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123", output_format="dataframe") assert flows.empty + @pytest.mark.production() def test_list_flows_by_tag(self): openml.config.server = self.production_server flows = openml.flows.list_flows(tag="weka", output_format="dataframe") - self.assertGreaterEqual(len(flows), 5) + assert len(flows) >= 5 for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) + @pytest.mark.production() def test_list_flows_paginate(self): openml.config.server = self.production_server size = 10 maximum = 100 for i in range(0, maximum, size): flows = openml.flows.list_flows(offset=i, size=size, output_format="dataframe") - self.assertGreaterEqual(size, len(flows)) + assert size >= len(flows) for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) @@ -112,10 +117,7 @@ def test_are_flows_equal(self): ]: new_flow = copy.deepcopy(flow) setattr(new_flow, attribute, new_value) - self.assertNotEqual( - getattr(flow, attribute), - getattr(new_flow, attribute), - ) + assert getattr(flow, attribute) != getattr(new_flow, attribute) self.assertRaises( ValueError, openml.flows.functions.assert_flows_equal, @@ -138,10 +140,7 @@ def 
test_are_flows_equal(self): ]: new_flow = copy.deepcopy(flow) setattr(new_flow, attribute, new_value) - self.assertNotEqual( - getattr(flow, attribute), - getattr(new_flow, attribute), - ) + assert getattr(flow, attribute) != getattr(new_flow, attribute) openml.flows.functions.assert_flows_equal(flow, new_flow) # Now test for parameters @@ -158,12 +157,18 @@ def test_are_flows_equal(self): parent_flow.components["subflow"] = subflow openml.flows.functions.assert_flows_equal(parent_flow, parent_flow) self.assertRaises( - ValueError, openml.flows.functions.assert_flows_equal, parent_flow, subflow + ValueError, + openml.flows.functions.assert_flows_equal, + parent_flow, + subflow, ) new_flow = copy.deepcopy(parent_flow) new_flow.components["subflow"].name = "Subflow name" self.assertRaises( - ValueError, openml.flows.functions.assert_flows_equal, parent_flow, new_flow + ValueError, + openml.flows.functions.assert_flows_equal, + parent_flow, + new_flow, ) def test_are_flows_equal_ignore_parameter_values(self): @@ -272,7 +277,7 @@ def test_are_flows_equal_ignore_if_older(self): ) assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="OrdinalEncoder introduced in 0.20. 
" @@ -290,31 +295,32 @@ def test_sklearn_to_flow_list_of_lists(self): # Test flow is accepted by server self._add_sentinel_to_flow_name(flow) flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) # Test deserialization works server_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) - self.assertEqual(server_flow.parameters["categories"], "[[0, 1], [0, 1]]") - self.assertEqual(server_flow.model.categories, flow.model.categories) + assert server_flow.parameters["categories"] == "[[0, 1], [0, 1]]" + assert server_flow.model.categories == flow.model.categories + @pytest.mark.production() def test_get_flow1(self): # Regression test for issue #305 # Basically, this checks that a flow without an external version can be loaded openml.config.server = self.production_server flow = openml.flows.get_flow(1) - self.assertIsNone(flow.external_version) + assert flow.external_version is None - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_get_flow_reinstantiate_model(self): model = ensemble.RandomForestClassifier(n_estimators=33) extension = openml.extensions.get_extension_by_model(model) flow = extension.model_to_flow(model) flow.publish(raise_error_if_exists=False) - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) - self.assertIsInstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) + assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) def test_get_flow_reinstantiate_model_no_extension(self): # Flow 10 is a WEKA flow @@ -326,11 +332,12 @@ def 
test_get_flow_reinstantiate_model_no_extension(self): reinstantiate=True, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) == "0.19.1", reason="Requires scikit-learn!=0.19.1, because target flow is from that version.", ) + @pytest.mark.production() def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self): openml.config.server = self.production_server flow = 8175 @@ -344,44 +351,47 @@ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception( strict_version=True, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "1" and LooseVersion(sklearn.__version__) != "1.0.0", - reason="Requires scikit-learn < 1.0.1." + reason="Requires scikit-learn < 1.0.1.", # Because scikit-learn dropped min_impurity_split hyperparameter in 1.0, # and the requested flow is from 1.0.0 exactly. ) + @pytest.mark.production() def test_get_flow_reinstantiate_flow_not_strict_post_1(self): openml.config.server = self.production_server flow = openml.flows.get_flow(flow_id=19190, reinstantiate=True, strict_version=False) assert flow.flow_id is None assert "sklearn==1.0.0" not in flow.dependencies - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( (LooseVersion(sklearn.__version__) < "0.23.2") - or ("1.0" < LooseVersion(sklearn.__version__)), - reason="Requires scikit-learn 0.23.2 or ~0.24." + or (LooseVersion(sklearn.__version__) > "1.0"), + reason="Requires scikit-learn 0.23.2 or ~0.24.", # Because these still have min_impurity_split, but with new scikit-learn module structure." 
) + @pytest.mark.production() def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self): openml.config.server = self.production_server flow = openml.flows.get_flow(flow_id=18587, reinstantiate=True, strict_version=False) assert flow.flow_id is None assert "sklearn==0.23.1" not in flow.dependencies - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( - "0.23" < LooseVersion(sklearn.__version__), + LooseVersion(sklearn.__version__) > "0.23", reason="Requires scikit-learn<=0.23, because the scikit-learn module structure changed.", ) + @pytest.mark.production() def test_get_flow_reinstantiate_flow_not_strict_pre_023(self): openml.config.server = self.production_server flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False) assert flow.flow_id is None assert "sklearn==0.19.1" not in flow.dependencies - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_get_flow_id(self): if self.long_version: list_all = openml.utils._list_all @@ -390,27 +400,28 @@ def test_get_flow_id(self): with patch("openml.utils._list_all", list_all): clf = sklearn.tree.DecisionTreeClassifier() flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id) + "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id), ) - self.assertEqual(openml.flows.get_flow_id(model=clf, exact_version=True), flow.flow_id) + assert openml.flows.get_flow_id(model=clf, exact_version=True) == flow.flow_id flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) - self.assertIn(flow.flow_id, flow_ids) - self.assertGreater(len(flow_ids), 0) + assert flow.flow_id in flow_ids + assert len(flow_ids) > 0 # Check that the output of get_flow_id is identical if only the name is given, no matter # whether 
exact_version is set to True or False. flow_ids_exact_version_True = openml.flows.get_flow_id( - name=flow.name, exact_version=True + name=flow.name, + exact_version=True, ) flow_ids_exact_version_False = openml.flows.get_flow_id( name=flow.name, exact_version=False, ) - self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False) - self.assertIn(flow.flow_id, flow_ids_exact_version_True) + assert flow_ids_exact_version_True == flow_ids_exact_version_False + assert flow.flow_id in flow_ids_exact_version_True def test_delete_flow(self): flow = openml.OpenMLFlow( @@ -431,7 +442,7 @@ def test_delete_flow(self): flow.publish() _flow_id = flow.flow_id - self.assertTrue(openml.flows.delete_flow(_flow_id)) + assert openml.flows.delete_flow(_flow_id) @mock.patch.object(requests.Session, "delete") @@ -439,7 +450,8 @@ def test_delete_flow_not_owned(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_owned.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -460,7 +472,8 @@ def test_delete_flow_with_run(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_has_runs.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -481,7 +494,8 @@ def test_delete_subflow(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_is_subflow.xml" mock_delete.return_value = create_request_response( - status_code=412, 
content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -502,7 +516,8 @@ def test_delete_flow_success(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_successful.xml" mock_delete.return_value = create_request_response( - status_code=200, content_filepath=content_file + status_code=200, + content_filepath=content_file, ) success = openml.flows.delete_flow(33364) @@ -520,7 +535,8 @@ def test_delete_unknown_flow(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_exist.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index 4a4764bed..8c4c03276 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -1,15 +1,16 @@ +from __future__ import annotations + import unittest.mock +import pytest + import openml import openml.testing class TestConfig(openml.testing.TestBase): def test_too_long_uri(self): - with self.assertRaisesRegex( - openml.exceptions.OpenMLServerError, - "URI too long!", - ): + with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"): openml.datasets.list_datasets(data_id=list(range(10000)), output_format="dataframe") @unittest.mock.patch("time.sleep") @@ -25,9 +26,7 @@ def test_retry_on_database_error(self, Session_class_mock, _): "" ) Session_class_mock.return_value.__enter__.return_value.get.return_value = response_mock - with self.assertRaisesRegex( - openml.exceptions.OpenMLServerException, "/abc returned code 107" - ): + with 
pytest.raises(openml.exceptions.OpenMLServerException, match="/abc returned code 107"): openml._api_calls._send_request("get", "/abc", {}) - self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 20) + assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20 diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index ba70689a1..bfb88a5db 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -1,30 +1,33 @@ # License: BSD 3-Clause +from __future__ import annotations -import tempfile import os +import tempfile import unittest.mock +from copy import copy +from pathlib import Path + +import pytest import openml.config import openml.testing class TestConfig(openml.testing.TestBase): - @unittest.mock.patch("os.path.expanduser") @unittest.mock.patch("openml.config.openml_logger.warning") @unittest.mock.patch("openml.config._create_log_handlers") @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033") - def test_non_writable_home(self, log_handler_mock, warnings_mock, expanduser_mock): + def test_non_writable_home(self, log_handler_mock, warnings_mock): with tempfile.TemporaryDirectory(dir=self.workdir) as td: - expanduser_mock.side_effect = ( - os.path.join(td, "openmldir"), - os.path.join(td, "cachedir"), - ) os.chmod(td, 0o444) - openml.config._setup() + _dd = copy(openml.config._defaults) + _dd["cachedir"] = Path(td) / "something-else" + openml.config._setup(_dd) - self.assertEqual(warnings_mock.call_count, 2) - self.assertEqual(log_handler_mock.call_count, 1) - self.assertFalse(log_handler_mock.call_args_list[0][1]["create_file_handler"]) + assert warnings_mock.call_count == 2 + assert log_handler_mock.call_count == 1 + assert not log_handler_mock.call_args_list[0][1]["create_file_handler"] + assert openml.config._root_cache_directory == Path(td) / "something-else" @unittest.mock.patch("os.path.expanduser") def 
test_XDG_directories_do_not_exist(self, expanduser_mock): @@ -39,20 +42,20 @@ def side_effect(path_): def test_get_config_as_dict(self): """Checks if the current configuration is returned accurately as a dict.""" config = openml.config.get_config_as_dict() - _config = dict() + _config = {} _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de" _config["server"] = "https://test.openml.org/api/v1/xml" _config["cachedir"] = self.workdir _config["avoid_duplicate_runs"] = False _config["connection_n_retries"] = 20 _config["retry_policy"] = "robot" - self.assertIsInstance(config, dict) - self.assertEqual(len(config), 6) + assert isinstance(config, dict) + assert len(config) == 6 self.assertDictEqual(config, _config) def test_setup_with_config(self): """Checks if the OpenML configuration can be updated using _setup().""" - _config = dict() + _config = {} _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de" _config["server"] = "https://www.openml.org/api/v1/xml" _config["cachedir"] = self.workdir @@ -67,6 +70,7 @@ def test_setup_with_config(self): class TestConfigurationForExamples(openml.testing.TestBase): + @pytest.mark.production() def test_switch_to_example_configuration(self): """Verifies the test configuration is loaded properly.""" # Below is the default test key which would be used anyway, but just for clarity: @@ -75,9 +79,10 @@ def test_switch_to_example_configuration(self): openml.config.start_using_configuration_for_example() - self.assertEqual(openml.config.apikey, "c0c42819af31e706efe1f4b88c23c6c1") - self.assertEqual(openml.config.server, self.test_server) + assert openml.config.apikey == "c0c42819af31e706efe1f4b88c23c6c1" + assert openml.config.server == self.test_server + @pytest.mark.production() def test_switch_from_example_configuration(self): """Verifies the previous configuration is loaded after stopping.""" # Below is the default test key which would be used anyway, but just for clarity: @@ -87,16 +92,19 @@ def 
test_switch_from_example_configuration(self): openml.config.start_using_configuration_for_example() openml.config.stop_using_configuration_for_example() - self.assertEqual(openml.config.apikey, "610344db6388d9ba34f6db45a3cf71de") - self.assertEqual(openml.config.server, self.production_server) + assert openml.config.apikey == "610344db6388d9ba34f6db45a3cf71de" + assert openml.config.server == self.production_server def test_example_configuration_stop_before_start(self): """Verifies an error is raised is `stop_...` is called before `start_...`.""" error_regex = ".*stop_use_example_configuration.*start_use_example_configuration.*first" self.assertRaisesRegex( - RuntimeError, error_regex, openml.config.stop_using_configuration_for_example + RuntimeError, + error_regex, + openml.config.stop_using_configuration_for_example, ) + @pytest.mark.production() def test_example_configuration_start_twice(self): """Checks that the original config can be returned to if `start..` is called twice.""" openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de" @@ -106,5 +114,5 @@ def test_example_configuration_start_twice(self): openml.config.start_using_configuration_for_example() openml.config.stop_using_configuration_for_example() - self.assertEqual(openml.config.apikey, "610344db6388d9ba34f6db45a3cf71de") - self.assertEqual(openml.config.server, self.production_server) + assert openml.config.apikey == "610344db6388d9ba34f6db45a3cf71de" + assert openml.config.server == self.production_server diff --git a/tests/test_openml/test_openml.py b/tests/test_openml/test_openml.py index 93d2e6925..998046726 100644 --- a/tests/test_openml/test_openml.py +++ b/tests/test_openml/test_openml.py @@ -1,9 +1,10 @@ # License: BSD 3-Clause +from __future__ import annotations from unittest import mock -from openml.testing import TestBase import openml +from openml.testing import TestBase class TestInit(TestBase): @@ -22,21 +23,21 @@ def test_populate_cache( task_mock, ): 
openml.populate_cache(task_ids=[1, 2], dataset_ids=[3, 4], flow_ids=[5, 6], run_ids=[7, 8]) - self.assertEqual(run_mock.call_count, 2) + assert run_mock.call_count == 2 for argument, fixture in zip(run_mock.call_args_list, [(7,), (8,)]): - self.assertEqual(argument[0], fixture) + assert argument[0] == fixture - self.assertEqual(flow_mock.call_count, 2) + assert flow_mock.call_count == 2 for argument, fixture in zip(flow_mock.call_args_list, [(5,), (6,)]): - self.assertEqual(argument[0], fixture) + assert argument[0] == fixture - self.assertEqual(dataset_mock.call_count, 2) + assert dataset_mock.call_count == 2 for argument, fixture in zip( dataset_mock.call_args_list, [(3,), (4,)], ): - self.assertEqual(argument[0], fixture) + assert argument[0] == fixture - self.assertEqual(task_mock.call_count, 2) + assert task_mock.call_count == 2 for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]): - self.assertEqual(argument[0], fixture) + assert argument[0] == fixture diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 0396d0f19..ce46b6548 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -1,24 +1,24 @@ # License: BSD 3-Clause +from __future__ import annotations -import numpy as np -import random import os +import random from time import time +import numpy as np +import pytest import xmltodict +from sklearn.base import clone from sklearn.dummy import DummyClassifier from sklearn.linear_model import LinearRegression -from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline -from sklearn.base import clone +from sklearn.tree import DecisionTreeClassifier -from openml import OpenMLRun -from openml.testing import TestBase, SimpleImputer import openml import openml.extensions.sklearn - -import pytest +from openml import OpenMLRun +from openml.testing import SimpleImputer, TestBase class TestRun(TestBase): @@ -30,22 +30,24 @@ def 
test_tagging(self): assert not runs.empty, "Test server state is incorrect" run_id = runs["run_id"].iloc[0] run = openml.runs.get_run(run_id) - tag = "testing_tag_{}_{}".format(self.id(), time()) + # tags can be at most 64 alphanumeric (+ underscore) chars + unique_indicator = str(time()).replace(".", "") + tag = f"test_tag_TestRun_{unique_indicator}" runs = openml.runs.list_runs(tag=tag, output_format="dataframe") - self.assertEqual(len(runs), 0) + assert len(runs) == 0 run.push_tag(tag) runs = openml.runs.list_runs(tag=tag, output_format="dataframe") - self.assertEqual(len(runs), 1) - self.assertIn(run_id, runs["run_id"]) + assert len(runs) == 1 + assert run_id in runs["run_id"] run.remove_tag(tag) runs = openml.runs.list_runs(tag=tag, output_format="dataframe") - self.assertEqual(len(runs), 0) + assert len(runs) == 0 @staticmethod def _test_prediction_data_equal(run, run_prime): # Determine which attributes are numeric and which not num_cols = np.array( - [d_type == "NUMERIC" for _, d_type in run._generate_arff_dict()["attributes"]] + [d_type == "NUMERIC" for _, d_type in run._generate_arff_dict()["attributes"]], ) # Get run data consistently # (For run from server, .data_content does not exist) @@ -68,15 +70,12 @@ def _test_run_obj_equals(self, run, run_prime): # should be none or empty other = getattr(run_prime, dictionary) if other is not None: - self.assertDictEqual(other, dict()) - self.assertEqual(run._to_xml(), run_prime._to_xml()) + self.assertDictEqual(other, {}) + assert run._to_xml() == run_prime._to_xml() self._test_prediction_data_equal(run, run_prime) # Test trace - if run.trace is not None: - run_trace_content = run.trace.trace_to_arff()["data"] - else: - run_trace_content = None + run_trace_content = run.trace.trace_to_arff()["data"] if run.trace is not None else None if run_prime.trace is not None: run_prime_trace_content = run_prime.trace.trace_to_arff()["data"] @@ -88,7 +87,7 @@ def _test_run_obj_equals(self, run, run_prime): def 
_check_array(array, type_): for line in array: for entry in line: - self.assertIsInstance(entry, type_) + assert isinstance(entry, type_) int_part = [line[:3] for line in run_trace_content] _check_array(int_part, int) @@ -106,25 +105,25 @@ def _check_array(array, type_): bool_part = [line[4] for line in run_trace_content] bool_part_prime = [line[4] for line in run_prime_trace_content] for bp, bpp in zip(bool_part, bool_part_prime): - self.assertIn(bp, ["true", "false"]) - self.assertIn(bpp, ["true", "false"]) + assert bp in ["true", "false"] + assert bpp in ["true", "false"] string_part = np.array(run_trace_content)[:, 5:] string_part_prime = np.array(run_prime_trace_content)[:, 5:] np.testing.assert_array_almost_equal(int_part, int_part_prime) np.testing.assert_array_almost_equal(float_part, float_part_prime) - self.assertEqual(bool_part, bool_part_prime) + assert bool_part == bool_part_prime np.testing.assert_array_equal(string_part, string_part_prime) else: - self.assertIsNone(run_prime_trace_content) + assert run_prime_trace_content is None - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_to_from_filesystem_vanilla(self): model = Pipeline( [ ("imputer", SimpleImputer(strategy="mean")), ("classifier", DecisionTreeClassifier(max_depth=1)), - ] + ], ) task = openml.tasks.get_task(119) # diabetes; crossvalidation run = openml.runs.run_model_on_task( @@ -144,23 +143,23 @@ def test_to_from_filesystem_vanilla(self): run_prime = openml.runs.OpenMLRun.from_filesystem(cache_path) # The flow has been uploaded to server, so only the reference flow_id should be present - self.assertTrue(run_prime.flow_id is not None) - self.assertTrue(run_prime.flow is None) + assert run_prime.flow_id is not None + assert run_prime.flow is None self._test_run_obj_equals(run, run_prime) run_prime.publish() TestBase._mark_entity_for_removal("run", run_prime.run_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], run_prime.run_id) + "collected from {}: 
{}".format(__file__.split("/")[-1], run_prime.run_id), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @pytest.mark.flaky() def test_to_from_filesystem_search(self): model = Pipeline( [ ("imputer", SimpleImputer(strategy="mean")), ("classifier", DecisionTreeClassifier(max_depth=1)), - ] + ], ) model = GridSearchCV( estimator=model, @@ -186,13 +185,13 @@ def test_to_from_filesystem_search(self): run_prime.publish() TestBase._mark_entity_for_removal("run", run_prime.run_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], run_prime.run_id) + "collected from {}: {}".format(__file__.split("/")[-1], run_prime.run_id), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_to_from_filesystem_no_model(self): model = Pipeline( - [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())] + [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())], ) task = openml.tasks.get_task(119) # diabetes; crossvalidation run = openml.runs.run_model_on_task(model=model, task=task, add_local_measures=False) @@ -211,7 +210,7 @@ def _get_models_tasks_for_tests(): [ ("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier(strategy="prior")), - ] + ], ) model_reg = Pipeline( [ @@ -221,7 +220,7 @@ def _get_models_tasks_for_tests(): # LR because dummy does not produce enough float-like values LinearRegression(), ), - ] + ], ) task_clf = openml.tasks.get_task(119) # diabetes; hold out validation @@ -256,7 +255,7 @@ def assert_run_prediction_data(task, run, model): # Get stored data for fold saved_fold_data = run.predictions[run.predictions["fold"] == fold_id].sort_values( - by="row_id" + by="row_id", ) saved_y_pred = saved_fold_data["prediction"].values gt_key = "truth" if "truth" in list(saved_fold_data) else "correct" @@ -272,7 +271,7 @@ def assert_run_prediction_data(task, run, model): assert_method(y_pred, saved_y_pred) assert_method(y_test, saved_y_test) - @pytest.mark.sklearn + 
@pytest.mark.sklearn() def test_publish_with_local_loaded_flow(self): """ Publish a run tied to a local flow after it has first been saved to @@ -284,7 +283,7 @@ def test_publish_with_local_loaded_flow(self): # Make sure the flow does not exist on the server yet. flow = extension.model_to_flow(model) self._add_sentinel_to_flow_name(flow) - self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version)) + assert not openml.flows.flow_exists(flow.name, flow.external_version) run = openml.runs.run_flow_on_task( flow=flow, @@ -295,7 +294,7 @@ def test_publish_with_local_loaded_flow(self): ) # Make sure that the flow has not been uploaded as requested. - self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version)) + assert not openml.flows.flow_exists(flow.name, flow.external_version) # Make sure that the prediction data stored in the run is correct. self.assert_run_prediction_data(task, run, clone(model)) @@ -309,14 +308,14 @@ def test_publish_with_local_loaded_flow(self): # Clean up TestBase._mark_entity_for_removal("run", loaded_run.run_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id) + "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id), ) # make sure the flow is published as part of publishing the run. - self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version)) + assert openml.flows.flow_exists(flow.name, flow.external_version) openml.runs.get_run(loaded_run.run_id) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_offline_and_online_run_identical(self): extension = openml.extensions.sklearn.SklearnExtension() @@ -324,7 +323,7 @@ def test_offline_and_online_run_identical(self): # Make sure the flow does not exist on the server yet. 
flow = extension.model_to_flow(model) self._add_sentinel_to_flow_name(flow) - self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version)) + assert not openml.flows.flow_exists(flow.name, flow.external_version) run = openml.runs.run_flow_on_task( flow=flow, @@ -335,7 +334,7 @@ def test_offline_and_online_run_identical(self): ) # Make sure that the flow has not been uploaded as requested. - self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version)) + assert not openml.flows.flow_exists(flow.name, flow.external_version) # Load from filesystem cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128))) @@ -347,7 +346,7 @@ def test_offline_and_online_run_identical(self): # Publish and test for offline - online run.publish() - self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version)) + assert openml.flows.flow_exists(flow.name, flow.external_version) try: online_run = openml.runs.get_run(run.run_id, ignore_cache=True) @@ -356,7 +355,7 @@ def test_offline_and_online_run_identical(self): # Clean up TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id) + "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id), ) def test_run_setup_string_included_in_xml(self): diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 8f3c0a71b..edd7e0198 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1,57 +1,60 @@ # License: BSD 3-Clause -import arff -from distutils.version import LooseVersion +from __future__ import annotations + +import ast import os import random import time -import sys -import ast +import unittest +import warnings +from distutils.version import LooseVersion from unittest import mock -import numpy as np +import arff import joblib +import numpy as np +import pandas as pd +import pytest import 
requests +import sklearn from joblib import parallel_backend +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import BaggingClassifier, RandomForestClassifier +from sklearn.feature_selection import VarianceThreshold +from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier +from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold +from sklearn.model_selection._search import BaseSearchCV +from sklearn.naive_bayes import GaussianNB +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier import openml -import openml.exceptions import openml._api_calls -import sklearn -import unittest -import warnings -import pandas as pd -import pytest - +import openml.exceptions import openml.extensions.sklearn -from openml.testing import TestBase, SimpleImputer, CustomImputer, create_request_response +from openml.exceptions import ( + OpenMLNotAuthorizedError, + OpenMLServerException, +) from openml.extensions.sklearn import cat, cont from openml.runs.functions import ( _run_task_get_arffcontent, - run_exists, - format_prediction, delete_run, + format_prediction, + run_exists, ) from openml.runs.trace import OpenMLRunTrace from openml.tasks import TaskType -from openml.testing import check_task_existence -from openml.exceptions import ( - OpenMLServerException, - OpenMLNotAuthorizedError, +from openml.testing import ( + CustomImputer, + SimpleImputer, + TestBase, + check_task_existence, + create_request_response, ) -from sklearn.naive_bayes import GaussianNB -from sklearn.model_selection._search import BaseSearchCV -from sklearn.tree import DecisionTreeClassifier - -from sklearn.dummy import DummyClassifier -from sklearn.preprocessing import StandardScaler, OneHotEncoder -from sklearn.feature_selection import VarianceThreshold -from sklearn.linear_model import 
LogisticRegression, SGDClassifier, LinearRegression -from sklearn.ensemble import RandomForestClassifier, BaggingClassifier -from sklearn.svm import SVC -from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold -from sklearn.pipeline import Pipeline, make_pipeline - class TestRun(TestBase): _multiprocess_can_split_ = True @@ -131,14 +134,12 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds): return raise RuntimeError( - "Could not find any evaluations! Please check whether run {} was " - "evaluated correctly on the server".format(run_id) + f"Could not find any evaluations! Please check whether run {run_id} was " + "evaluated correctly on the server", ) def _assert_predictions_equal(self, predictions, predictions_prime): - self.assertEqual( - np.array(predictions_prime["data"]).shape, np.array(predictions["data"]).shape - ) + assert np.array(predictions_prime["data"]).shape == np.array(predictions["data"]).shape # The original search model does not submit confidence # bounds, so we can not compare the arff line @@ -150,14 +151,14 @@ def _assert_predictions_equal(self, predictions, predictions_prime): for col_idx in compare_slice: val_1 = predictions["data"][idx][col_idx] val_2 = predictions_prime["data"][idx][col_idx] - if type(val_1) == float or type(val_2) == float: + if isinstance(val_1, float) or isinstance(val_2, float): self.assertAlmostEqual( float(val_1), float(val_2), places=6, ) else: - self.assertEqual(val_1, val_2) + assert val_1 == val_2 def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed, create_task_obj): run = openml.runs.get_run(run_id) @@ -211,7 +212,7 @@ def _perform_run( Runs a classifier on a task, and performs some basic checks. Also uploads the run. 
- Parameters: + Parameters ---------- task_id : int @@ -238,8 +239,8 @@ def _perform_run( sentinel: optional, str in case the sentinel should be user specified - Returns: - -------- + Returns + ------- run: OpenMLRun The performed run (with run id) """ @@ -262,13 +263,13 @@ def _remove_random_state(flow): flow, _ = self._add_sentinel_to_flow_name(flow, sentinel) if not openml.flows.flow_exists(flow.name, flow.external_version): flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) - TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) + TestBase.logger.info(f"collected from test_run_functions: {flow.flow_id}") task = openml.tasks.get_task(task_id) X, y = task.get_X_and_y() - self.assertEqual(np.count_nonzero(np.isnan(X)), n_missing_vals) + assert np.count_nonzero(np.isnan(X)) == n_missing_vals run = openml.runs.run_flow_on_task( flow=flow, task=task, @@ -277,9 +278,9 @@ def _remove_random_state(flow): ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) - self.assertEqual(run_, run) - self.assertIsInstance(run.dataset_id, int) + TestBase.logger.info(f"collected from test_run_functions: {run.run_id}") + assert run_ == run + assert isinstance(run.dataset_id, int) # This is only a smoke check right now # TODO add a few asserts here @@ -290,7 +291,7 @@ def _remove_random_state(flow): run.trace.trace_to_arff() # check arff output - self.assertEqual(len(run.data_content), num_instances) + assert len(run.data_content) == num_instances if check_setup: # test the initialize setup function @@ -307,14 +308,14 @@ def _remove_random_state(flow): flow.class_name, flow.flow_id, ) - self.assertIn("random_state", flow.parameters, error_msg) + assert "random_state" in flow.parameters, error_msg # If the flow is initialized from a model without a 
random # state, the flow is on the server without any random state - self.assertEqual(flow.parameters["random_state"], "null") + assert flow.parameters["random_state"] == "null" # As soon as a flow is run, a random state is set in the model. # If a flow is re-instantiated - self.assertEqual(flow_local.parameters["random_state"], flow_expected_rsv) - self.assertEqual(flow_server.parameters["random_state"], flow_expected_rsv) + assert flow_local.parameters["random_state"] == flow_expected_rsv + assert flow_server.parameters["random_state"] == flow_expected_rsv _remove_random_state(flow_local) _remove_random_state(flow_server) openml.flows.assert_flows_equal(flow_local, flow_server) @@ -325,7 +326,7 @@ def _remove_random_state(flow): ) flow_server2 = self.extension.model_to_flow(clf_server2) if flow.class_name not in classes_without_random_state: - self.assertEqual(flow_server2.parameters["random_state"], flow_expected_rsv) + assert flow_server2.parameters["random_state"] == flow_expected_rsv _remove_random_state(flow_server2) openml.flows.assert_flows_equal(flow_local, flow_server2) @@ -345,7 +346,12 @@ def _remove_random_state(flow): return run def _check_sample_evaluations( - self, sample_evaluations, num_repeats, num_folds, num_samples, max_time_allowed=60000 + self, + sample_evaluations, + num_repeats, + num_folds, + num_samples, + max_time_allowed=60000, ): """ Checks whether the right timing measures are attached to the run @@ -356,7 +362,6 @@ def _check_sample_evaluations( default max_time_allowed (per fold, in milli seconds) = 1 minute, quite pessimistic """ - # a dict mapping from openml measure to a tuple with the minimum and # maximum allowed value check_measures = { @@ -370,31 +375,28 @@ def _check_sample_evaluations( "predictive_accuracy": (0, 1), } - self.assertIsInstance(sample_evaluations, dict) - if sys.version_info[:2] >= (3, 3): - # this only holds if we are allowed to record time (otherwise some - # are missing) - 
self.assertEqual(set(sample_evaluations.keys()), set(check_measures.keys())) + assert isinstance(sample_evaluations, dict) + assert set(sample_evaluations.keys()) == set(check_measures.keys()) - for measure in check_measures.keys(): + for measure in check_measures: if measure in sample_evaluations: num_rep_entrees = len(sample_evaluations[measure]) - self.assertEqual(num_rep_entrees, num_repeats) + assert num_rep_entrees == num_repeats for rep in range(num_rep_entrees): num_fold_entrees = len(sample_evaluations[measure][rep]) - self.assertEqual(num_fold_entrees, num_folds) + assert num_fold_entrees == num_folds for fold in range(num_fold_entrees): num_sample_entrees = len(sample_evaluations[measure][rep][fold]) - self.assertEqual(num_sample_entrees, num_samples) + assert num_sample_entrees == num_samples for sample in range(num_sample_entrees): evaluation = sample_evaluations[measure][rep][fold][sample] - self.assertIsInstance(evaluation, float) + assert isinstance(evaluation, float) if not (os.environ.get("CI_WINDOWS") or os.name == "nt"): # Windows seems to get an eval-time of 0 sometimes. 
- self.assertGreater(evaluation, 0) - self.assertLess(evaluation, max_time_allowed) + assert evaluation > 0 + assert evaluation < max_time_allowed - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_regression_on_classif_task(self): task_id = 115 # diabetes; crossvalidation @@ -402,8 +404,8 @@ def test_run_regression_on_classif_task(self): task = openml.tasks.get_task(task_id) # internally dataframe is loaded and targets are categorical # which LinearRegression() cannot handle - with self.assertRaisesRegex( - AttributeError, "'LinearRegression' object has no attribute 'classes_'" + with pytest.raises( + AttributeError, match="'LinearRegression' object has no attribute 'classes_'" ): openml.runs.run_model_on_task( model=clf, @@ -412,7 +414,7 @@ def test_run_regression_on_classif_task(self): dataset_format="array", ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_check_erronous_sklearn_flow_fails(self): task_id = 115 # diabetes; crossvalidation task = openml.tasks.get_task(task_id) @@ -431,7 +433,7 @@ def test_check_erronous_sklearn_flow_fails(self): exceptions = (ValueError, InvalidParameterError) except ImportError: exceptions = (ValueError,) - with self.assertRaises(exceptions): + with pytest.raises(exceptions): openml.runs.run_model_on_task( task=task, model=clf, @@ -492,18 +494,18 @@ def determine_grid_size(param_grid): scores = run.get_metric_fn(metric) # compare with the scores in user defined measures scores_provided = [] - for rep in run.fold_evaluations[metric_name].keys(): - for fold in run.fold_evaluations[metric_name][rep].keys(): + for rep in run.fold_evaluations[metric_name]: + for fold in run.fold_evaluations[metric_name][rep]: scores_provided.append(run.fold_evaluations[metric_name][rep][fold]) - self.assertEqual(sum(scores_provided), sum(scores)) + assert sum(scores_provided) == sum(scores) if isinstance(clf, BaseSearchCV): trace_content = run.trace.trace_to_arff()["data"] if isinstance(clf, GridSearchCV): grid_iterations = 
determine_grid_size(clf.param_grid) - self.assertEqual(len(trace_content), grid_iterations * num_folds) + assert len(trace_content) == grid_iterations * num_folds else: - self.assertEqual(len(trace_content), num_iterations * num_folds) + assert len(trace_content) == num_iterations * num_folds # downloads the best model based on the optimization trace # suboptimal (slow), and not guaranteed to work if evaluation @@ -521,24 +523,41 @@ def determine_grid_size(param_grid): raise e self._rerun_model_and_compare_predictions( - run.run_id, model_prime, seed, create_task_obj=True + run.run_id, + model_prime, + seed, + create_task_obj=True, ) self._rerun_model_and_compare_predictions( - run.run_id, model_prime, seed, create_task_obj=False + run.run_id, + model_prime, + seed, + create_task_obj=False, ) else: run_downloaded = openml.runs.get_run(run.run_id) sid = run_downloaded.setup_id model_prime = openml.setups.initialize_model(sid) self._rerun_model_and_compare_predictions( - run.run_id, model_prime, seed, create_task_obj=True + run.run_id, + model_prime, + seed, + create_task_obj=True, ) self._rerun_model_and_compare_predictions( - run.run_id, model_prime, seed, create_task_obj=False + run.run_id, + model_prime, + seed, + create_task_obj=False, ) # todo: check if runtime is present - self._check_fold_timing_evaluations(run.fold_evaluations, 1, num_folds, task_type=task_type) + self._check_fold_timing_evaluations( + fold_evaluations=run.fold_evaluations, + num_repeats=1, + num_folds=num_folds, + task_type=task_type + ) # Check if run string and print representation do not run into an error # The above check already verifies that all columns needed for supported @@ -550,7 +569,13 @@ def determine_grid_size(param_grid): return run def _run_and_upload_classification( - self, clf, task_id, n_missing_vals, n_test_obs, flow_expected_rsv, sentinel=None + self, + clf, + task_id, + n_missing_vals, + n_test_obs, + flow_expected_rsv, + sentinel=None, ): num_folds = 1 # because of 
holdout num_iterations = 5 # for base search algorithms @@ -573,7 +598,13 @@ def _run_and_upload_classification( ) def _run_and_upload_regression( - self, clf, task_id, n_missing_vals, n_test_obs, flow_expected_rsv, sentinel=None + self, + clf, + task_id, + n_missing_vals, + n_test_obs, + flow_expected_rsv, + sentinel=None, ): num_folds = 10 # because of cross-validation num_iterations = 5 # for base search algorithms @@ -595,7 +626,7 @@ def _run_and_upload_regression( sentinel=sentinel, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_logistic_regression(self): lr = LogisticRegression(solver="lbfgs", max_iter=1000) task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] @@ -603,7 +634,7 @@ def test_run_and_upload_logistic_regression(self): n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501") - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_linear_regression(self): lr = LinearRegression() task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"] @@ -627,26 +658,26 @@ def test_run_and_upload_linear_regression(self): raise Exception(repr(e)) # mark to remove the uploaded task TestBase._mark_entity_for_removal("task", task_id) - TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + TestBase.logger.info(f"collected from test_run_functions: {task_id}") n_missing_vals = self.TEST_SERVER_TASK_REGRESSION["n_missing_vals"] n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"] self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501") - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_pipeline_dummy_pipeline(self): pipeline1 = Pipeline( steps=[ ("scaler", StandardScaler(with_mean=False)), ("dummy", DummyClassifier(strategy="prior")), - ] + ], ) task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] n_test_obs = 
self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501") - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -661,7 +692,8 @@ def get_ct_cf(nominal_indices, numeric_indices): ( "numeric", make_pipeline( - SimpleImputer(strategy="mean"), sklearn.preprocessing.StandardScaler() + SimpleImputer(strategy="mean"), + sklearn.preprocessing.StandardScaler(), ), numeric_indices, ), @@ -680,7 +712,7 @@ def get_ct_cf(nominal_indices, numeric_indices): steps=[ ("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier()), - ] + ], ) sentinel = self._get_sentinel() @@ -709,7 +741,7 @@ def get_ct_cf(nominal_indices, numeric_indices): sentinel=sentinel, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skip("https://github.com/openml/OpenML/issues/1180") @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", @@ -718,7 +750,8 @@ def get_ct_cf(nominal_indices, numeric_indices): @mock.patch("warnings.warn") def test_run_and_upload_knn_pipeline(self, warnings_mock): cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore"), ) cont_imp = make_pipeline(CustomImputer(), StandardScaler()) from sklearn.compose import ColumnTransformer @@ -733,12 +766,12 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock): "Estimator", RandomizedSearchCV( KNeighborsClassifier(), - {"n_neighbors": [x for x in range(2, 10)]}, + {"n_neighbors": list(range(2, 10))}, cv=3, n_iter=10, ), ), - ] + ], ) task_id = self.TEST_SERVER_TASK_MISSING_VALS["task_id"] @@ -758,9 +791,9 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock): for _warnings in warnings_mock.call_args_list: if _warnings[0][0] == warning_msg: call_count += 1 - 
self.assertEqual(call_count, 3) + assert call_count == 3 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_gridsearch(self): gridsearch = GridSearchCV( BaggingClassifier(base_estimator=SVC()), @@ -777,9 +810,9 @@ def test_run_and_upload_gridsearch(self): n_test_obs=n_test_obs, flow_expected_rsv="62501", ) - self.assertEqual(len(run.trace.trace_iterations), 9) + assert len(run.trace.trace_iterations) == 9 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_randomsearch(self): randomsearch = RandomizedSearchCV( RandomForestClassifier(n_estimators=5), @@ -807,11 +840,11 @@ def test_run_and_upload_randomsearch(self): n_test_obs=n_test_obs, flow_expected_rsv="12172", ) - self.assertEqual(len(run.trace.trace_iterations), 5) + assert len(run.trace.trace_iterations) == 5 trace = openml.runs.get_run_trace(run.run_id) - self.assertEqual(len(trace.trace_iterations), 5) + assert len(trace.trace_iterations) == 5 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_maskedarrays(self): # This testcase is important for 2 reasons: # 1) it verifies the correct handling of masked arrays (not all @@ -829,12 +862,16 @@ def test_run_and_upload_maskedarrays(self): n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification( - gridsearch, task_id, n_missing_vals, n_test_obs, "12172" + gridsearch, + task_id, + n_missing_vals, + n_test_obs, + "12172", ) ########################################################################## - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_learning_curve_task_1(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -847,14 +884,18 @@ def test_learning_curve_task_1(self): steps=[ ("scaler", StandardScaler(with_mean=False)), ("dummy", DummyClassifier(strategy="prior")), - ] + ], ) run = self._perform_run( - task_id, num_test_instances, num_missing_vals, 
pipeline1, flow_expected_rsv="62501" + task_id, + num_test_instances, + num_missing_vals, + pipeline1, + flow_expected_rsv="62501", ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_learning_curve_task_2(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -873,20 +914,24 @@ def test_learning_curve_task_2(self): DecisionTreeClassifier(), { "min_samples_split": [2**x for x in range(1, 8)], - "min_samples_leaf": [2**x for x in range(0, 7)], + "min_samples_leaf": [2**x for x in range(7)], }, cv=3, n_iter=10, ), ), - ] + ], ) run = self._perform_run( - task_id, num_test_instances, num_missing_vals, pipeline2, flow_expected_rsv="62501" + task_id, + num_test_instances, + num_missing_vals, + pipeline2, + flow_expected_rsv="62501", ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="Pipelines don't support indexing (used for the assert check)", @@ -911,7 +956,7 @@ def test_initialize_cv_from_run(self): n_iter=2, ), ), - ] + ], ) task = openml.tasks.get_task(11) # kr-vs-kp; holdout @@ -923,22 +968,22 @@ def test_initialize_cv_from_run(self): ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) + TestBase.logger.info(f"collected from test_run_functions: {run.run_id}") run = openml.runs.get_run(run_.run_id) modelR = openml.runs.initialize_model_from_run(run_id=run.run_id) modelS = openml.setups.initialize_model(setup_id=run.setup_id) - self.assertEqual(modelS[-1].cv.random_state, 62501) - self.assertEqual(modelR[-1].cv.random_state, 62501) + assert modelS[-1].cv.random_state == 62501 + assert modelR[-1].cv.random_state == 62501 def _test_local_evaluations(self, run): # compare 
with the scores in user defined measures accuracy_scores_provided = [] - for rep in run.fold_evaluations["predictive_accuracy"].keys(): - for fold in run.fold_evaluations["predictive_accuracy"][rep].keys(): + for rep in run.fold_evaluations["predictive_accuracy"]: + for fold in run.fold_evaluations["predictive_accuracy"][rep]: accuracy_scores_provided.append( - run.fold_evaluations["predictive_accuracy"][rep][fold] + run.fold_evaluations["predictive_accuracy"][rep][fold], ) accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score) np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores) @@ -955,17 +1000,17 @@ def _test_local_evaluations(self, run): tests.append((sklearn.metrics.jaccard_similarity_score, {})) else: tests.append((sklearn.metrics.jaccard_score, {})) - for test_idx, test in enumerate(tests): + for _test_idx, test in enumerate(tests): alt_scores = run.get_metric_fn( sklearn_fn=test[0], kwargs=test[1], ) - self.assertEqual(len(alt_scores), 10) + assert len(alt_scores) == 10 for idx in range(len(alt_scores)): - self.assertGreaterEqual(alt_scores[idx], 0) - self.assertLessEqual(alt_scores[idx], 1) + assert alt_scores[idx] >= 0 + assert alt_scores[idx] <= 1 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_local_run_swapped_parameter_order_model(self): clf = DecisionTreeClassifier() australian_task = 595 # Australian; crossvalidation @@ -981,7 +1026,7 @@ def test_local_run_swapped_parameter_order_model(self): self._test_local_evaluations(run) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="SimpleImputer doesn't handle mixed type DataFrame as input", @@ -993,7 +1038,7 @@ def test_local_run_swapped_parameter_order_flow(self): ("imputer", SimpleImputer(strategy="most_frequent")), ("encoder", OneHotEncoder(handle_unknown="ignore")), ("estimator", RandomForestClassifier(n_estimators=10)), - ] + ], ) flow = self.extension.model_to_flow(clf) @@ -1010,7 
+1055,7 @@ def test_local_run_swapped_parameter_order_flow(self): self._test_local_evaluations(run) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="SimpleImputer doesn't handle mixed type DataFrame as input", @@ -1022,7 +1067,7 @@ def test_local_run_metric_score(self): ("imputer", SimpleImputer(strategy="most_frequent")), ("encoder", OneHotEncoder(handle_unknown="ignore")), ("estimator", RandomForestClassifier(n_estimators=10)), - ] + ], ) # download task @@ -1038,6 +1083,7 @@ def test_local_run_metric_score(self): self._test_local_evaluations(run) + @pytest.mark.production() def test_online_run_metric_score(self): openml.config.server = self.production_server @@ -1047,7 +1093,7 @@ def test_online_run_metric_score(self): self._test_local_evaluations(run) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="SimpleImputer doesn't handle mixed type DataFrame as input", @@ -1058,7 +1104,7 @@ def test_initialize_model_from_run(self): ("Imputer", SimpleImputer(strategy="most_frequent")), ("VarianceThreshold", VarianceThreshold(threshold=0.05)), ("Estimator", GaussianNB()), - ] + ], ) task_meta_data = { "task_type": TaskType.SUPERVISED_CLASSIFICATION, @@ -1084,7 +1130,7 @@ def test_initialize_model_from_run(self): raise Exception(repr(e)) # mark to remove the uploaded task TestBase._mark_entity_for_removal("task", task_id) - TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + TestBase.logger.info(f"collected from test_run_functions: {task_id}") task = openml.tasks.get_task(task_id) run = openml.runs.run_model_on_task( @@ -1094,7 +1140,7 @@ def test_initialize_model_from_run(self): ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run_.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run_.run_id)) + TestBase.logger.info(f"collected from test_run_functions: 
{run_.run_id}") run = openml.runs.get_run(run_.run_id) modelR = openml.runs.initialize_model_from_run(run_id=run.run_id) @@ -1106,10 +1152,10 @@ def test_initialize_model_from_run(self): openml.flows.assert_flows_equal(flowR, flowL) openml.flows.assert_flows_equal(flowS, flowL) - self.assertEqual(flowS.components["Imputer"].parameters["strategy"], '"most_frequent"') - self.assertEqual(flowS.components["VarianceThreshold"].parameters["threshold"], "0.05") + assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"' + assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05" - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="SimpleImputer doesn't handle mixed type DataFrame as input", @@ -1125,14 +1171,14 @@ def test__run_exists(self): ("Imputer", SimpleImputer(strategy="mean")), ("VarianceThreshold", VarianceThreshold(threshold=0.05)), ("Estimator", DecisionTreeClassifier(max_depth=4)), - ] + ], ), sklearn.pipeline.Pipeline( steps=[ ("Imputer", SimpleImputer(strategy="most_frequent")), ("VarianceThreshold", VarianceThreshold(threshold=0.1)), ("Estimator", DecisionTreeClassifier(max_depth=4)), - ] + ], ), ] @@ -1143,28 +1189,32 @@ def test__run_exists(self): # first populate the server with this run. # skip run if it was already performed. run = openml.runs.run_model_on_task( - model=clf, task=task, seed=rs, avoid_duplicate_runs=True, upload_flow=True + model=clf, + task=task, + seed=rs, + avoid_duplicate_runs=True, + upload_flow=True, ) run.publish() TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) + TestBase.logger.info(f"collected from test_run_functions: {run.run_id}") except openml.exceptions.PyOpenMLError: # run already existed. Great. 
pass flow = self.extension.model_to_flow(clf) flow_exists = openml.flows.flow_exists(flow.name, flow.external_version) - self.assertGreater(flow_exists, 0, "Server says flow from run does not exist.") + assert flow_exists > 0, "Server says flow from run does not exist." # Do NOT use get_flow reinitialization, this potentially sets # hyperparameter values wrong. Rather use the local model. downloaded_flow = openml.flows.get_flow(flow_exists) downloaded_flow.model = clf setup_exists = openml.setups.setup_exists(downloaded_flow) - self.assertGreater(setup_exists, 0, "Server says setup of run does not exist.") + assert setup_exists > 0, "Server says setup of run does not exist." run_ids = run_exists(task.task_id, setup_exists) - self.assertTrue(run_ids, msg=(run_ids, clf)) + assert run_ids, (run_ids, clf) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_with_illegal_flow_id(self): # check the case where the user adds an illegal flow id to a # non-existing flo @@ -1174,16 +1224,16 @@ def test_run_with_illegal_flow_id(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.flow_id = -1 expected_message_regex = ( - "Flow does not exist on the server, " "but 'flow.flow_id' is not None." + r"Flow does not exist on the server, but 'flow.flow_id' is not None." ) - with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex): + with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): openml.runs.run_flow_on_task( task=task, flow=flow, avoid_duplicate_runs=True, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_with_illegal_flow_id_after_load(self): # Same as `test_run_with_illegal_flow_id`, but test this error is also # caught if the run is stored to and loaded from disk first. 
@@ -1193,7 +1243,10 @@ def test_run_with_illegal_flow_id_after_load(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.flow_id = -1 run = openml.runs.run_flow_on_task( - task=task, flow=flow, avoid_duplicate_runs=False, upload_flow=False + task=task, + flow=flow, + avoid_duplicate_runs=False, + upload_flow=False, ) cache_path = os.path.join( @@ -1205,14 +1258,14 @@ def test_run_with_illegal_flow_id_after_load(self): loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) expected_message_regex = ( - "Flow does not exist on the server, " "but 'flow.flow_id' is not None." + r"Flow does not exist on the server, but 'flow.flow_id' is not None." ) - with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex): + with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): loaded_run.publish() TestBase._mark_entity_for_removal("run", loaded_run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(loaded_run.run_id)) + TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}") - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_with_illegal_flow_id_1(self): # Check the case where the user adds an illegal flow id to an existing # flow. 
Comes to a different value error than the previous test @@ -1221,8 +1274,8 @@ def test_run_with_illegal_flow_id_1(self): flow_orig = self.extension.model_to_flow(clf) try: flow_orig.publish() # ensures flow exist on server - TestBase._mark_entity_for_removal("flow", (flow_orig.flow_id, flow_orig.name)) - TestBase.logger.info("collected from test_run_functions: {}".format(flow_orig.flow_id)) + TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name) + TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}") except openml.exceptions.OpenMLServerException: # flow already exists pass @@ -1230,14 +1283,14 @@ def test_run_with_illegal_flow_id_1(self): flow_new.flow_id = -1 expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'" - with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex): + with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): openml.runs.run_flow_on_task( task=task, flow=flow_new, avoid_duplicate_runs=True, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_with_illegal_flow_id_1_after_load(self): # Same as `test_run_with_illegal_flow_id_1`, but test this error is # also caught if the run is stored to and loaded from disk first. 
@@ -1246,8 +1299,8 @@ def test_run_with_illegal_flow_id_1_after_load(self): flow_orig = self.extension.model_to_flow(clf) try: flow_orig.publish() # ensures flow exist on server - TestBase._mark_entity_for_removal("flow", (flow_orig.flow_id, flow_orig.name)) - TestBase.logger.info("collected from test_run_functions: {}".format(flow_orig.flow_id)) + TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name) + TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}") except openml.exceptions.OpenMLServerException: # flow already exists pass @@ -1255,7 +1308,10 @@ def test_run_with_illegal_flow_id_1_after_load(self): flow_new.flow_id = -1 run = openml.runs.run_flow_on_task( - task=task, flow=flow_new, avoid_duplicate_runs=False, upload_flow=False + task=task, + flow=flow_new, + avoid_duplicate_runs=False, + upload_flow=False, ) cache_path = os.path.join( @@ -1268,10 +1324,12 @@ def test_run_with_illegal_flow_id_1_after_load(self): expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'" self.assertRaisesRegex( - openml.exceptions.PyOpenMLError, expected_message_regex, loaded_run.publish + openml.exceptions.PyOpenMLError, + expected_message_regex, + loaded_run.publish, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="OneHotEncoder cannot handle mixed type DataFrame as input", @@ -1283,7 +1341,8 @@ def test__run_task_get_arffcontent(self): num_repeats = 1 clf = make_pipeline( - OneHotEncoder(handle_unknown="ignore"), SGDClassifier(loss="log", random_state=1) + OneHotEncoder(handle_unknown="ignore"), + SGDClassifier(loss="log", random_state=1), ) res = openml.runs.functions._run_task_get_arffcontent( extension=self.extension, @@ -1294,46 +1353,50 @@ def test__run_task_get_arffcontent(self): ) arff_datacontent, trace, fold_evaluations, _ = res # predictions - self.assertIsInstance(arff_datacontent, list) + assert 
isinstance(arff_datacontent, list) # trace. SGD does not produce any - self.assertIsInstance(trace, type(None)) + assert isinstance(trace, type(None)) task_type = TaskType.SUPERVISED_CLASSIFICATION self._check_fold_timing_evaluations( - fold_evaluations, num_repeats, num_folds, task_type=task_type + fold_evaluations=fold_evaluations, + num_repeats=num_repeats, + num_folds=num_folds, + task_type=task_type, ) # 10 times 10 fold CV of 150 samples - self.assertEqual(len(arff_datacontent), num_instances * num_repeats) + assert len(arff_datacontent) == num_instances * num_repeats for arff_line in arff_datacontent: # check number columns - self.assertEqual(len(arff_line), 8) + assert len(arff_line) == 8 # check repeat - self.assertGreaterEqual(arff_line[0], 0) - self.assertLessEqual(arff_line[0], num_repeats - 1) + assert arff_line[0] >= 0 + assert arff_line[0] <= num_repeats - 1 # check fold - self.assertGreaterEqual(arff_line[1], 0) - self.assertLessEqual(arff_line[1], num_folds - 1) + assert arff_line[1] >= 0 + assert arff_line[1] <= num_folds - 1 # check row id - self.assertGreaterEqual(arff_line[2], 0) - self.assertLessEqual(arff_line[2], num_instances - 1) + assert arff_line[2] >= 0 + assert arff_line[2] <= num_instances - 1 # check prediction and ground truth columns - self.assertIn(arff_line[4], ["won", "nowin"]) - self.assertIn(arff_line[5], ["won", "nowin"]) + assert arff_line[4] in ["won", "nowin"] + assert arff_line[5] in ["won", "nowin"] # check confidences self.assertAlmostEqual(sum(arff_line[6:]), 1.0) def test__create_trace_from_arff(self): - with open(self.static_cache_dir + "/misc/trace.arff", "r") as arff_file: + with open(self.static_cache_dir / "misc" / "trace.arff") as arff_file: trace_arff = arff.load(arff_file) OpenMLRunTrace.trace_from_arff(trace_arff) + @pytest.mark.production() def test_get_run(self): # this run is not available on test openml.config.server = self.production_server run = openml.runs.get_run(473351) - 
self.assertEqual(run.dataset_id, 357) - self.assertEqual(run.evaluations["f_measure"], 0.841225) + assert run.dataset_id == 357 + assert run.evaluations["f_measure"] == 0.841225 for i, value in [ (0, 0.840918), (1, 0.839458), @@ -1346,7 +1409,7 @@ def test_get_run(self): (8, 0.84218), (9, 0.844014), ]: - self.assertEqual(run.fold_evaluations["f_measure"][0][i], value) + assert run.fold_evaluations["f_measure"][0][i] == value assert "weka" in run.tags assert "weka_3.7.12" in run.tags assert run.predictions_url == ( @@ -1360,14 +1423,15 @@ def _check_run(self, run): # They are run_id, task_id, task_type_id, setup_id, flow_id, uploader, upload_time # error_message and run_details exist, too, but are not used so far. We need to update # this check once they are used! - self.assertIsInstance(run, dict) + assert isinstance(run, dict) assert len(run) == 8, str(run) + @pytest.mark.production() def test_get_runs_list(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server runs = openml.runs.list_runs(id=[2], show_errors=True, output_format="dataframe") - self.assertEqual(len(runs), 1) + assert len(runs) == 1 for run in runs.to_dict(orient="index").values(): self._check_run(run) @@ -1377,26 +1441,28 @@ def test_list_runs_empty(self): def test_list_runs_output_format(self): runs = openml.runs.list_runs(size=1000, output_format="dataframe") - self.assertIsInstance(runs, pd.DataFrame) + assert isinstance(runs, pd.DataFrame) + @pytest.mark.production() def test_get_runs_list_by_task(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server task_ids = [20] runs = openml.runs.list_runs(task=task_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), 590) + assert len(runs) >= 590 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["task_id"], task_ids) + assert run["task_id"] in task_ids self._check_run(run) num_runs = len(runs) task_ids.append(21) runs = 
openml.runs.list_runs(task=task_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), num_runs + 1) + assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["task_id"], task_ids) + assert run["task_id"] in task_ids self._check_run(run) + @pytest.mark.production() def test_get_runs_list_by_uploader(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server @@ -1404,38 +1470,40 @@ def test_get_runs_list_by_uploader(self): uploader_ids = [29] runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), 2) + assert len(runs) >= 2 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["uploader"], uploader_ids) + assert run["uploader"] in uploader_ids self._check_run(run) num_runs = len(runs) uploader_ids.append(274) runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), num_runs + 1) + assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["uploader"], uploader_ids) + assert run["uploader"] in uploader_ids self._check_run(run) + @pytest.mark.production() def test_get_runs_list_by_flow(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server flow_ids = [1154] runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), 1) + assert len(runs) >= 1 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["flow_id"], flow_ids) + assert run["flow_id"] in flow_ids self._check_run(run) num_runs = len(runs) flow_ids.append(1069) runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), num_runs + 1) + assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["flow_id"], flow_ids) + assert run["flow_id"] in 
flow_ids self._check_run(run) + @pytest.mark.production() def test_get_runs_pagination(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server @@ -1444,12 +1512,16 @@ def test_get_runs_pagination(self): max = 100 for i in range(0, max, size): runs = openml.runs.list_runs( - offset=i, size=size, uploader=uploader_ids, output_format="dataframe" + offset=i, + size=size, + uploader=uploader_ids, + output_format="dataframe", ) - self.assertGreaterEqual(size, len(runs)) + assert size >= len(runs) for run in runs.to_dict(orient="index").values(): - self.assertIn(run["uploader"], uploader_ids) + assert run["uploader"] in uploader_ids + @pytest.mark.production() def test_get_runs_list_by_filters(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server @@ -1468,30 +1540,34 @@ def test_get_runs_list_by_filters(self): # openml.runs.list_runs) runs = openml.runs.list_runs(id=ids, output_format="dataframe") - self.assertEqual(len(runs), 2) + assert len(runs) == 2 runs = openml.runs.list_runs(task=tasks, output_format="dataframe") - self.assertGreaterEqual(len(runs), 2) + assert len(runs) >= 2 runs = openml.runs.list_runs(uploader=uploaders_2, output_format="dataframe") - self.assertGreaterEqual(len(runs), 10) + assert len(runs) >= 10 runs = openml.runs.list_runs(flow=flows, output_format="dataframe") - self.assertGreaterEqual(len(runs), 100) + assert len(runs) >= 100 runs = openml.runs.list_runs( - id=ids, task=tasks, uploader=uploaders_1, output_format="dataframe" + id=ids, + task=tasks, + uploader=uploaders_1, + output_format="dataframe", ) - self.assertEqual(len(runs), 2) + assert len(runs) == 2 + @pytest.mark.production() def test_get_runs_list_by_tag(self): # TODO: comes from live, no such lists on test # Unit test works on production server only openml.config.server = self.production_server runs = openml.runs.list_runs(tag="curves", output_format="dataframe") - 
self.assertGreaterEqual(len(runs), 1) + assert len(runs) >= 1 - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -1505,12 +1581,13 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): from sklearn.compose import ColumnTransformer cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore"), ) cont_imp = make_pipeline(CustomImputer(), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) model = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], ) # build a sklearn classifier data_content, _, _, _ = _run_task_get_arffcontent( @@ -1522,12 +1599,12 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): ) # 2 folds, 5 repeats; keep in mind that this task comes from the test # server, the task on the live server is different - self.assertEqual(len(data_content), 4490) + assert len(data_content) == 4490 for row in data_content: # repeat, fold, row_id, 6 confidences, prediction and correct label - self.assertEqual(len(row), 12) + assert len(row) == 12 - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -1548,12 +1625,13 @@ def test_run_on_dataset_with_missing_labels_array(self): from sklearn.compose import ColumnTransformer cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore"), ) cont_imp = make_pipeline(CustomImputer(), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, 
cont)]) model = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], ) # build a sklearn classifier data_content, _, _, _ = _run_task_get_arffcontent( @@ -1565,10 +1643,10 @@ def test_run_on_dataset_with_missing_labels_array(self): ) # 2 folds, 5 repeats; keep in mind that this task comes from the test # server, the task on the live server is different - self.assertEqual(len(data_content), 4490) + assert len(data_content) == 4490 for row in data_content: # repeat, fold, row_id, 6 confidences, prediction and correct label - self.assertEqual(len(row), 12) + assert len(row) == 12 def test_get_cached_run(self): openml.config.set_root_cache_directory(self.static_cache_dir) @@ -1576,16 +1654,16 @@ def test_get_cached_run(self): def test_get_uncached_run(self): openml.config.set_root_cache_directory(self.static_cache_dir) - with self.assertRaises(openml.exceptions.OpenMLCacheException): + with pytest.raises(openml.exceptions.OpenMLCacheException): openml.runs.functions._get_cached_run(10) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_flow_on_task_downloaded_flow(self): model = sklearn.ensemble.RandomForestClassifier(n_estimators=33) flow = self.extension.model_to_flow(model) flow.publish(raise_error_if_exists=False) - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) - TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) + TestBase.logger.info(f"collected from test_run_functions: {flow.flow_id}") downloaded_flow = openml.flows.get_flow(flow.flow_id) task = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE["task_id"]) @@ -1600,49 +1678,51 @@ def test_run_flow_on_task_downloaded_flow(self): TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], 
run.run_id)) + @pytest.mark.production() def test_format_prediction_non_supervised(self): # non-supervised tasks don't exist on the test server openml.config.server = self.production_server clustering = openml.tasks.get_task(126033, download_data=False) ignored_input = [0] * 5 - with self.assertRaisesRegex( - NotImplementedError, r"Formatting for is not supported." + with pytest.raises( + NotImplementedError, match=r"Formatting for is not supported." ): format_prediction(clustering, *ignored_input) def test_format_prediction_classification_no_probabilities(self): classification = openml.tasks.get_task( - self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False + self.TEST_SERVER_TASK_SIMPLE["task_id"], + download_data=False, ) ignored_input = [0] * 5 - with self.assertRaisesRegex(ValueError, "`proba` is required for classification task"): + with pytest.raises(ValueError, match="`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) def test_format_prediction_classification_incomplete_probabilities(self): classification = openml.tasks.get_task( - self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False + self.TEST_SERVER_TASK_SIMPLE["task_id"], + download_data=False, ) ignored_input = [0] * 5 incomplete_probabilities = {c: 0.2 for c in classification.class_labels[1:]} - with self.assertRaisesRegex(ValueError, "Each class should have a predicted probability"): + with pytest.raises(ValueError, match="Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) def test_format_prediction_task_without_classlabels_set(self): classification = openml.tasks.get_task( - self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False + self.TEST_SERVER_TASK_SIMPLE["task_id"], + download_data=False, ) classification.class_labels = None ignored_input = [0] * 5 - with self.assertRaisesRegex( - ValueError, "The classification task must have class 
labels set" - ): + with pytest.raises(ValueError, match="The classification task must have class labels set"): format_prediction(classification, *ignored_input, proba={}) def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} ignored_input = [0] * 5 - with self.assertRaisesRegex(ValueError, "`sample` can not be none for LearningCurveTask"): + with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"): format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) def test_format_prediction_task_regression(self): @@ -1665,14 +1745,14 @@ def test_format_prediction_task_regression(self): raise Exception(repr(e)) # mark to remove the uploaded task TestBase._mark_entity_for_removal("task", task_id) - TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + TestBase.logger.info(f"collected from test_run_functions: {task_id}") regression = openml.tasks.get_task(task_id, download_data=False) ignored_input = [0] * 5 res = format_prediction(regression, *ignored_input) self.assertListEqual(res, [0] * 5) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="couldn't perform local tests successfully w/o bloating RAM", @@ -1703,12 +1783,12 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): # The _prevent_optimize_n_jobs() is a function executed within the _run_model_on_fold() # block and mocking this function doesn't affect rest of the pipeline, but is adequately # indicative if _run_model_on_fold() is being called or not. 
- self.assertEqual(parallel_mock.call_count, 0) - self.assertIsInstance(res[0], list) - self.assertEqual(len(res[0]), num_instances) - self.assertEqual(len(res[0][0]), line_length) - self.assertEqual(len(res[2]), 7) - self.assertEqual(len(res[3]), 7) + assert parallel_mock.call_count == 0 + assert isinstance(res[0], list) + assert len(res[0]) == num_instances + assert len(res[0][0]) == line_length + assert len(res[2]) == 7 + assert len(res[3]) == 7 expected_scores = [ 0.965625, 0.94375, @@ -1723,10 +1803,12 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): ] scores = [v for k, v in res[2]["predictive_accuracy"][0].items()] np.testing.assert_array_almost_equal( - scores, expected_scores, decimal=2 if os.name == "nt" else 7 + scores, + expected_scores, + decimal=2 if os.name == "nt" else 7, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="couldn't perform local tests successfully w/o bloating RAM", @@ -1760,7 +1842,9 @@ def test_joblib_backends(self, parallel_mock): }, random_state=1, cv=sklearn.model_selection.StratifiedKFold( - n_splits=2, shuffle=True, random_state=1 + n_splits=2, + shuffle=True, + random_state=1, ), n_iter=5, n_jobs=n_jobs, @@ -1774,14 +1858,14 @@ def test_joblib_backends(self, parallel_mock): dataset_format="array", # "dataframe" would require handling of categoricals n_jobs=n_jobs, ) - self.assertEqual(type(res[0]), list) - self.assertEqual(len(res[0]), num_instances) - self.assertEqual(len(res[0][0]), line_length) + assert type(res[0]) == list + assert len(res[0]) == num_instances + assert len(res[0][0]) == line_length # usercpu_time_millis_* not recorded when n_jobs > 1 # *_time_millis_* not recorded when n_jobs = -1 - self.assertEqual(len(res[2]["predictive_accuracy"][0]), 10) - self.assertEqual(len(res[3]["predictive_accuracy"][0]), 10) - self.assertEqual(parallel_mock.call_count, call_count) + assert len(res[2]["predictive_accuracy"][0]) == 10 + assert 
len(res[3]["predictive_accuracy"][0]) == 10 + assert parallel_mock.call_count == call_count @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", @@ -1790,17 +1874,17 @@ def test_joblib_backends(self, parallel_mock): def test_delete_run(self): rs = 1 clf = sklearn.pipeline.Pipeline( - steps=[("imputer", SimpleImputer()), ("estimator", DecisionTreeClassifier())] + steps=[("imputer", SimpleImputer()), ("estimator", DecisionTreeClassifier())], ) task = openml.tasks.get_task(32) # diabetes; crossvalidation run = openml.runs.run_model_on_task(model=clf, task=task, seed=rs) run.publish() TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) + TestBase.logger.info(f"collected from test_run_functions: {run.run_id}") _run_id = run.run_id - self.assertTrue(delete_run(_run_id)) + assert delete_run(_run_id) @mock.patch.object(requests.Session, "delete") @@ -1808,7 +1892,8 @@ def test_delete_run_not_owned(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_not_owned.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -1829,7 +1914,8 @@ def test_delete_run_success(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_successful.xml" mock_delete.return_value = create_request_response( - status_code=200, content_filepath=content_file + status_code=200, + content_filepath=content_file, ) success = openml.runs.delete_run(10591880) @@ -1847,7 +1933,8 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = 
test_files_directory / "mock_responses" / "runs" / "run_delete_not_exist.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( diff --git a/tests/test_runs/test_trace.py b/tests/test_runs/test_trace.py index d08c99e88..bdf9de42d 100644 --- a/tests/test_runs/test_trace.py +++ b/tests/test_runs/test_trace.py @@ -1,4 +1,7 @@ # License: BSD 3-Clause +from __future__ import annotations + +import pytest from openml.runs import OpenMLRunTrace, OpenMLTraceIteration from openml.testing import TestBase @@ -23,30 +26,21 @@ def test_get_selected_iteration(self): trace = OpenMLRunTrace(-1, trace_iterations=trace_iterations) # This next one should simply not fail - self.assertEqual(trace.get_selected_iteration(2, 2), 2) - with self.assertRaisesRegex( - ValueError, - "Could not find the selected iteration for rep/fold 3/3", + assert trace.get_selected_iteration(2, 2) == 2 + with pytest.raises( + ValueError, match="Could not find the selected iteration for rep/fold 3/3" ): trace.get_selected_iteration(3, 3) def test_initialization(self): """Check all different ways to fail the initialization""" - with self.assertRaisesRegex( - ValueError, - "Trace content not available.", - ): + with pytest.raises(ValueError, match="Trace content not available."): OpenMLRunTrace.generate(attributes="foo", content=None) - with self.assertRaisesRegex( - ValueError, - "Trace attributes not available.", - ): + with pytest.raises(ValueError, match="Trace attributes not available."): OpenMLRunTrace.generate(attributes=None, content="foo") - with self.assertRaisesRegex(ValueError, "Trace content is empty."): + with pytest.raises(ValueError, match="Trace content is empty."): OpenMLRunTrace.generate(attributes="foo", content=[]) - with self.assertRaisesRegex( - ValueError, "Trace_attributes and trace_content not compatible:" - ): + with pytest.raises(ValueError, 
match="Trace_attributes and trace_content not compatible:"): OpenMLRunTrace.generate(attributes=["abc"], content=[[1, 2]]) def test_duplicate_name(self): @@ -61,8 +55,9 @@ def test_duplicate_name(self): ("repeat", "NUMERICAL"), ] trace_content = [[0, 0, 0, 0.5, "true", 1], [0, 0, 0, 0.9, "false", 2]] - with self.assertRaisesRegex( - ValueError, "Either `setup_string` or `parameters` needs to be passed as argument." + with pytest.raises( + ValueError, + match="Either `setup_string` or `parameters` needs to be passed as argument.", ): OpenMLRunTrace.generate(trace_attributes, trace_content) @@ -75,8 +70,9 @@ def test_duplicate_name(self): ("sunshine", "NUMERICAL"), ] trace_content = [[0, 0, 0, 0.5, "true", 1], [0, 0, 0, 0.9, "false", 2]] - with self.assertRaisesRegex( + with pytest.raises( ValueError, - "Encountered unknown attribute sunshine that does not start with " "prefix parameter_", + match="Encountered unknown attribute sunshine that does not start with " + "prefix parameter_", ): OpenMLRunTrace.generate(trace_attributes, trace_content) diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index ef1acc405..9e357f6aa 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -1,20 +1,21 @@ # License: BSD 3-Clause +from __future__ import annotations import hashlib import time import unittest.mock +from typing import Dict + +import pandas as pd +import pytest +import sklearn.base +import sklearn.naive_bayes +import sklearn.tree import openml import openml.exceptions import openml.extensions.sklearn from openml.testing import TestBase -from typing import Dict -import pandas as pd -import pytest - -import sklearn.tree -import sklearn.naive_bayes -import sklearn.base def get_sentinel(): @@ -24,8 +25,7 @@ def get_sentinel(): md5 = hashlib.md5() md5.update(str(time.time()).encode("utf-8")) sentinel = md5.hexdigest()[:10] - sentinel = "TEST%s" % sentinel - return sentinel + 
return "TEST%s" % sentinel class TestSetupFunctions(TestBase): @@ -35,37 +35,37 @@ def setUp(self): self.extension = openml.extensions.sklearn.SklearnExtension() super().setUp() - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_nonexisting_setup_exists(self): # first publish a non-existing flow sentinel = get_sentinel() # because of the sentinel, we can not use flows that contain subflows dectree = sklearn.tree.DecisionTreeClassifier() flow = self.extension.model_to_flow(dectree) - flow.name = "TEST%s%s" % (sentinel, flow.name) + flow.name = f"TEST{sentinel}{flow.name}" flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) # although the flow exists (created as of previous statement), # we can be sure there are no setups (yet) as it was just created # and hasn't been ran setup_id = openml.setups.setup_exists(flow) - self.assertFalse(setup_id) + assert not setup_id def _existing_setup_exists(self, classif): flow = self.extension.model_to_flow(classif) - flow.name = "TEST%s%s" % (get_sentinel(), flow.name) + flow.name = f"TEST{get_sentinel()}{flow.name}" flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) # although the flow exists, we can be sure there are no # setups (yet) as it hasn't been ran setup_id = openml.setups.setup_exists(flow) - self.assertFalse(setup_id) + assert not setup_id setup_id = openml.setups.setup_exists(flow) - self.assertFalse(setup_id) + assert not setup_id # now run the flow on an easy task: task = openml.tasks.get_task(115) # diabetes; crossvalidation @@ -80,9 +80,9 @@ def _existing_setup_exists(self, classif): # execute the function we are interested 
in setup_id = openml.setups.setup_exists(flow) - self.assertEqual(setup_id, run.setup_id) + assert setup_id == run.setup_id - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_existing_setup_exists_1(self): def side_effect(self): self.var_smoothing = 1e-9 @@ -97,12 +97,12 @@ def side_effect(self): nb = sklearn.naive_bayes.GaussianNB() self._existing_setup_exists(nb) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_exisiting_setup_exists_2(self): # Check a flow with one hyperparameter self._existing_setup_exists(sklearn.naive_bayes.GaussianNB()) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_existing_setup_exists_3(self): # Check a flow with many hyperparameters self._existing_setup_exists( @@ -112,7 +112,7 @@ def test_existing_setup_exists_3(self): # Not setting the random state will make this flow fail as running it # will add a random random_state. random_state=1, - ) + ), ) def test_get_setup(self): @@ -128,10 +128,11 @@ def test_get_setup(self): current = openml.setups.get_setup(setups[idx]) assert current.flow_id > 0 if num_params[idx] == 0: - self.assertIsNone(current.parameters) + assert current.parameters is None else: - self.assertEqual(len(current.parameters), num_params[idx]) + assert len(current.parameters) == num_params[idx] + @pytest.mark.production() def test_setup_list_filter_flow(self): openml.config.server = self.production_server @@ -139,49 +140,47 @@ def test_setup_list_filter_flow(self): setups = openml.setups.list_setups(flow=flow_id) - self.assertGreater(len(setups), 0) # TODO: please adjust 0 - for setup_id in setups.keys(): - self.assertEqual(setups[setup_id].flow_id, flow_id) + assert len(setups) > 0 # TODO: please adjust 0 + for setup_id in setups: + assert setups[setup_id].flow_id == flow_id def test_list_setups_empty(self): setups = openml.setups.list_setups(setup=[0]) if len(setups) > 0: raise ValueError("UnitTest Outdated, got somehow results") - self.assertIsInstance(setups, dict) + assert isinstance(setups, 
dict) + @pytest.mark.production() def test_list_setups_output_format(self): openml.config.server = self.production_server flow_id = 6794 setups = openml.setups.list_setups(flow=flow_id, output_format="object", size=10) - self.assertIsInstance(setups, Dict) - self.assertIsInstance(setups[list(setups.keys())[0]], openml.setups.setup.OpenMLSetup) - self.assertEqual(len(setups), 10) + assert isinstance(setups, Dict) + assert isinstance(setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup) + assert len(setups) == 10 setups = openml.setups.list_setups(flow=flow_id, output_format="dataframe", size=10) - self.assertIsInstance(setups, pd.DataFrame) - self.assertEqual(len(setups), 10) + assert isinstance(setups, pd.DataFrame) + assert len(setups) == 10 # TODO: [0.15] Remove section as `dict` is no longer supported. with pytest.warns(FutureWarning): setups = openml.setups.list_setups(flow=flow_id, output_format="dict", size=10) - self.assertIsInstance(setups, Dict) - self.assertIsInstance(setups[list(setups.keys())[0]], Dict) - self.assertEqual(len(setups), 10) + assert isinstance(setups, Dict) + assert isinstance(setups[next(iter(setups.keys()))], Dict) + assert len(setups) == 10 def test_setuplist_offset(self): - # TODO: remove after pull on live for better testing - # openml.config.server = self.production_server - size = 10 setups = openml.setups.list_setups(offset=0, size=size) - self.assertEqual(len(setups), size) + assert len(setups) == size setups2 = openml.setups.list_setups(offset=size, size=size) - self.assertEqual(len(setups2), size) + assert len(setups2) == size all = set(setups.keys()).union(setups2.keys()) - self.assertEqual(len(all), size * 2) + assert len(all) == size * 2 def test_get_cached_setup(self): openml.config.set_root_cache_directory(self.static_cache_dir) @@ -189,5 +188,5 @@ def test_get_cached_setup(self): def test_get_uncached_setup(self): openml.config.set_root_cache_directory(self.static_cache_dir) - with 
self.assertRaises(openml.exceptions.OpenMLCacheException): + with pytest.raises(openml.exceptions.OpenMLCacheException): openml.setups.functions._get_cached_setup(10) diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index cc3294085..b3f418756 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,19 +1,21 @@ # License: BSD 3-Clause +from __future__ import annotations -from openml.testing import TestBase -from openml.extensions.sklearn import cat, cont +import unittest +from distutils.version import LooseVersion import pytest import sklearn -import unittest -from distutils.version import LooseVersion + +from openml.extensions.sklearn import cat, cont +from openml.testing import TestBase class TestStudyFunctions(TestBase): _multiprocess_can_split_ = True """Test the example code of Bischl et al. (2018)""" - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.24", reason="columntransformer introduction in 0.24.0", @@ -38,35 +40,38 @@ def test_Figure1a(self): run.publish() # publish the experiment on OpenML (optional) print('URL for run: %s/run/%d' %(openml.config.server,run.run_id)) """ # noqa: E501 - import openml import sklearn.metrics import sklearn.tree + from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline, make_pipeline - from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder, StandardScaler + import openml + benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite cat_imp = OneHotEncoder(handle_unknown="ignore") cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) clf = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + 
steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], ) # build a sklearn classifier for task_id in benchmark_suite.tasks[:1]: # iterate over all tasks task = openml.tasks.get_task(task_id) # download the OpenML task X, y = task.get_X_and_y() # get the data (not used in this example) openml.config.apikey = openml.config.apikey # set the OpenML Api Key run = openml.runs.run_model_on_task( - clf, task, avoid_duplicate_runs=False + clf, + task, + avoid_duplicate_runs=False, ) # run classifier on splits (requires API key) score = run.get_metric_fn(sklearn.metrics.accuracy_score) # print accuracy score TestBase.logger.info( - "Data set: %s; Accuracy: %0.2f" % (task.get_dataset().name, score.mean()) + f"Data set: {task.get_dataset().name}; Accuracy: {score.mean():0.2f}", ) run.publish() # publish the experiment on OpenML (optional) TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], run.run_id) + "collected from {}: {}".format(__file__.split("/")[-1], run.run_id), ) TestBase.logger.info("URL for run: %s/run/%d" % (openml.config.server, run.run_id)) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index bfbbbee49..721c81f9e 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -1,70 +1,75 @@ # License: BSD 3-Clause -from typing import Optional, List +from __future__ import annotations + +import pandas as pd +import pytest import openml import openml.study from openml.testing import TestBase -import pandas as pd -import pytest class TestStudyFunctions(TestBase): _multiprocess_can_split_ = True + @pytest.mark.production() def test_get_study_old(self): openml.config.server = self.production_server study = openml.study.get_study(34) - self.assertEqual(len(study.data), 105) - self.assertEqual(len(study.tasks), 105) - self.assertEqual(len(study.flows), 27) - 
self.assertEqual(len(study.setups), 30) - self.assertIsNone(study.runs) + assert len(study.data) == 105 + assert len(study.tasks) == 105 + assert len(study.flows) == 27 + assert len(study.setups) == 30 + assert study.runs is None + @pytest.mark.production() def test_get_study_new(self): openml.config.server = self.production_server study = openml.study.get_study(123) - self.assertEqual(len(study.data), 299) - self.assertEqual(len(study.tasks), 299) - self.assertEqual(len(study.flows), 5) - self.assertEqual(len(study.setups), 1253) - self.assertEqual(len(study.runs), 1693) + assert len(study.data) == 299 + assert len(study.tasks) == 299 + assert len(study.flows) == 5 + assert len(study.setups) == 1253 + assert len(study.runs) == 1693 + @pytest.mark.production() def test_get_openml100(self): openml.config.server = self.production_server study = openml.study.get_study("OpenML100", "tasks") - self.assertIsInstance(study, openml.study.OpenMLBenchmarkSuite) + assert isinstance(study, openml.study.OpenMLBenchmarkSuite) study_2 = openml.study.get_suite("OpenML100") - self.assertIsInstance(study_2, openml.study.OpenMLBenchmarkSuite) - self.assertEqual(study.study_id, study_2.study_id) + assert isinstance(study_2, openml.study.OpenMLBenchmarkSuite) + assert study.study_id == study_2.study_id + @pytest.mark.production() def test_get_study_error(self): openml.config.server = self.production_server - with self.assertRaisesRegex( - ValueError, - "Unexpected entity type 'task' reported by the server, expected 'run'", + with pytest.raises( + ValueError, match="Unexpected entity type 'task' reported by the server, expected 'run'" ): openml.study.get_study(99) + @pytest.mark.production() def test_get_suite(self): openml.config.server = self.production_server study = openml.study.get_suite(99) - self.assertEqual(len(study.data), 72) - self.assertEqual(len(study.tasks), 72) - self.assertIsNone(study.flows) - self.assertIsNone(study.runs) - self.assertIsNone(study.setups) + assert 
len(study.data) == 72 + assert len(study.tasks) == 72 + assert study.flows is None + assert study.runs is None + assert study.setups is None + @pytest.mark.production() def test_get_suite_error(self): openml.config.server = self.production_server - with self.assertRaisesRegex( - ValueError, - "Unexpected entity type 'run' reported by the server, expected 'task'", + with pytest.raises( + ValueError, match="Unexpected entity type 'run' reported by the server, expected 'task'" ): openml.study.get_suite(123) @@ -84,20 +89,20 @@ def test_publish_benchmark_suite(self): TestBase._mark_entity_for_removal("study", study.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id)) - self.assertGreater(study.id, 0) + assert study.id > 0 # verify main meta data study_downloaded = openml.study.get_suite(study.id) - self.assertEqual(study_downloaded.alias, fixture_alias) - self.assertEqual(study_downloaded.name, fixture_name) - self.assertEqual(study_downloaded.description, fixture_descr) - self.assertEqual(study_downloaded.main_entity_type, "task") + assert study_downloaded.alias == fixture_alias + assert study_downloaded.name == fixture_name + assert study_downloaded.description == fixture_descr + assert study_downloaded.main_entity_type == "task" # verify resources - self.assertIsNone(study_downloaded.flows) - self.assertIsNone(study_downloaded.setups) - self.assertIsNone(study_downloaded.runs) - self.assertGreater(len(study_downloaded.data), 0) - self.assertLessEqual(len(study_downloaded.data), len(fixture_task_ids)) + assert study_downloaded.flows is None + assert study_downloaded.setups is None + assert study_downloaded.runs is None + assert len(study_downloaded.data) > 0 + assert len(study_downloaded.data) <= len(fixture_task_ids) self.assertSetEqual(set(study_downloaded.tasks), set(fixture_task_ids)) # attach more tasks @@ -114,11 +119,11 @@ def test_publish_benchmark_suite(self): # test status update function 
openml.study.update_suite_status(study.id, "deactivated") study_downloaded = openml.study.get_suite(study.id) - self.assertEqual(study_downloaded.status, "deactivated") + assert study_downloaded.status == "deactivated" # can't delete study, now it's not longer in preparation def _test_publish_empty_study_is_allowed(self, explicit: bool): - runs: Optional[List[int]] = [] if explicit else None + runs: list[int] | None = [] if explicit else None kind = "explicit" if explicit else "implicit" study = openml.study.create_study( @@ -131,10 +136,10 @@ def _test_publish_empty_study_is_allowed(self, explicit: bool): TestBase._mark_entity_for_removal("study", study.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id)) - self.assertGreater(study.id, 0) + assert study.id > 0 study_downloaded = openml.study.get_study(study.id) - self.assertEqual(study_downloaded.main_entity_type, "run") - self.assertIsNone(study_downloaded.runs) + assert study_downloaded.main_entity_type == "run" + assert study_downloaded.runs is None def test_publish_empty_study_explicit(self): self._test_publish_empty_study_is_allowed(explicit=True) @@ -146,14 +151,14 @@ def test_publish_empty_study_implicit(self): def test_publish_study(self): # get some random runs to attach run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10) - self.assertEqual(len(run_list), 10) + assert len(run_list) == 10 fixt_alias = None fixt_name = "unit tested study" fixt_descr = "bla" - fixt_flow_ids = set([evaluation.flow_id for evaluation in run_list.values()]) - fixt_task_ids = set([evaluation.task_id for evaluation in run_list.values()]) - fixt_setup_ids = set([evaluation.setup_id for evaluation in run_list.values()]) + fixt_flow_ids = {evaluation.flow_id for evaluation in run_list.values()} + fixt_task_ids = {evaluation.task_id for evaluation in run_list.values()} + fixt_setup_ids = {evaluation.setup_id for evaluation in run_list.values()} study = 
openml.study.create_study( alias=fixt_alias, @@ -165,12 +170,12 @@ def test_publish_study(self): study.publish() TestBase._mark_entity_for_removal("study", study.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id)) - self.assertGreater(study.id, 0) + assert study.id > 0 study_downloaded = openml.study.get_study(study.id) - self.assertEqual(study_downloaded.alias, fixt_alias) - self.assertEqual(study_downloaded.name, fixt_name) - self.assertEqual(study_downloaded.description, fixt_descr) - self.assertEqual(study_downloaded.main_entity_type, "run") + assert study_downloaded.alias == fixt_alias + assert study_downloaded.name == fixt_name + assert study_downloaded.description == fixt_descr + assert study_downloaded.main_entity_type == "run" self.assertSetEqual(set(study_downloaded.runs), set(run_list.keys())) self.assertSetEqual(set(study_downloaded.setups), set(fixt_setup_ids)) @@ -183,7 +188,9 @@ def test_publish_study(self): # test whether the list evaluation function also handles study data fine run_ids = openml.evaluations.list_evaluations( - "predictive_accuracy", size=None, study=study.id + "predictive_accuracy", + size=None, + study=study.id, ) self.assertSetEqual(set(run_ids), set(study_downloaded.runs)) @@ -204,16 +211,16 @@ def test_publish_study(self): # test status update function openml.study.update_study_status(study.id, "deactivated") study_downloaded = openml.study.get_study(study.id) - self.assertEqual(study_downloaded.status, "deactivated") + assert study_downloaded.status == "deactivated" res = openml.study.delete_study(study.id) - self.assertTrue(res) + assert res def test_study_attach_illegal(self): run_list = openml.runs.list_runs(size=10) - self.assertEqual(len(run_list), 10) + assert len(run_list) == 10 run_list_more = openml.runs.list_runs(size=20) - self.assertEqual(len(run_list_more), 20) + assert len(run_list_more) == 20 study = openml.study.create_study( alias=None, @@ -227,14 +234,14 @@ def 
test_study_attach_illegal(self): TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id)) study_original = openml.study.get_study(study.id) - with self.assertRaisesRegex( - openml.exceptions.OpenMLServerException, "Problem attaching entities." + with pytest.raises( + openml.exceptions.OpenMLServerException, match="Problem attaching entities." ): # run id does not exists openml.study.attach_to_study(study.id, [0]) - with self.assertRaisesRegex( - openml.exceptions.OpenMLServerException, "Problem attaching entities." + with pytest.raises( + openml.exceptions.OpenMLServerException, match="Problem attaching entities." ): # some runs already attached openml.study.attach_to_study(study.id, list(run_list_more.keys())) @@ -244,8 +251,8 @@ def test_study_attach_illegal(self): def test_study_list(self): study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe") # might fail if server is recently reset - self.assertGreaterEqual(len(study_list), 2) + assert len(study_list) >= 2 def test_study_list_output_format(self): study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe") - self.assertIsInstance(study_list, pd.DataFrame) + assert isinstance(study_list, pd.DataFrame) diff --git a/tests/test_tasks/__init__.py b/tests/test_tasks/__init__.py index e987ab735..26488a8cc 100644 --- a/tests/test_tasks/__init__.py +++ b/tests/test_tasks/__init__.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause -from .test_task import OpenMLTaskTest from .test_supervised_task import OpenMLSupervisedTaskTest +from .test_task import OpenMLTaskTest __all__ = [ "OpenMLTaskTest", diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index 4f03c77fc..661e8eced 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -1,8 +1,10 @@ # License: BSD 3-Clause +from __future__ import annotations import numpy as np from 
openml.tasks import TaskType, get_task + from .test_supervised_task import OpenMLSupervisedTaskTest @@ -10,25 +12,25 @@ class OpenMLClassificationTaskTest(OpenMLSupervisedTaskTest): __test__ = True def setUp(self, n_levels: int = 1): - super(OpenMLClassificationTaskTest, self).setUp() + super().setUp() self.task_id = 119 # diabetes self.task_type = TaskType.SUPERVISED_CLASSIFICATION self.estimation_procedure = 1 def test_get_X_and_Y(self): - X, Y = super(OpenMLClassificationTaskTest, self).test_get_X_and_Y() - self.assertEqual((768, 8), X.shape) - self.assertIsInstance(X, np.ndarray) - self.assertEqual((768,), Y.shape) - self.assertIsInstance(Y, np.ndarray) - self.assertEqual(Y.dtype, int) + X, Y = super().test_get_X_and_Y() + assert X.shape == (768, 8) + assert isinstance(X, np.ndarray) + assert Y.shape == (768,) + assert isinstance(Y, np.ndarray) + assert Y.dtype == int def test_download_task(self): - task = super(OpenMLClassificationTaskTest, self).test_download_task() - self.assertEqual(task.task_id, self.task_id) - self.assertEqual(task.task_type_id, TaskType.SUPERVISED_CLASSIFICATION) - self.assertEqual(task.dataset_id, 20) + task = super().test_download_task() + assert task.task_id == self.task_id + assert task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION + assert task.dataset_id == 20 def test_class_labels(self): task = get_task(self.task_id) - self.assertEqual(task.class_labels, ["tested_negative", "tested_positive"]) + assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py index d7a414276..bc59ad26c 100644 --- a/tests/test_tasks/test_clustering_task.py +++ b/tests/test_tasks/test_clustering_task.py @@ -1,34 +1,40 @@ # License: BSD 3-Clause +from __future__ import annotations + +import pytest import openml +from openml.exceptions import OpenMLServerException from openml.tasks import TaskType from openml.testing import TestBase + from 
.test_task import OpenMLTaskTest -from openml.exceptions import OpenMLServerException class OpenMLClusteringTaskTest(OpenMLTaskTest): __test__ = True def setUp(self, n_levels: int = 1): - super(OpenMLClusteringTaskTest, self).setUp() + super().setUp() self.task_id = 146714 self.task_type = TaskType.CLUSTERING self.estimation_procedure = 17 + @pytest.mark.production() def test_get_dataset(self): # no clustering tasks on test server openml.config.server = self.production_server task = openml.tasks.get_task(self.task_id) task.get_dataset() + @pytest.mark.production() def test_download_task(self): # no clustering tasks on test server openml.config.server = self.production_server - task = super(OpenMLClusteringTaskTest, self).test_download_task() - self.assertEqual(task.task_id, self.task_id) - self.assertEqual(task.task_type_id, TaskType.CLUSTERING) - self.assertEqual(task.dataset_id, 36) + task = super().test_download_task() + assert task.task_id == self.task_id + assert task.task_type_id == TaskType.CLUSTERING + assert task.dataset_id == 36 def test_upload_task(self): compatible_datasets = self._get_compatible_rand_dataset() @@ -44,7 +50,7 @@ def test_upload_task(self): task = task.publish() TestBase._mark_entity_for_removal("task", task.id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], task.id) + "collected from {}: {}".format(__file__.split("/")[-1], task.id), ) # success break @@ -58,5 +64,5 @@ def test_upload_task(self): raise e else: raise ValueError( - "Could not create a valid task for task type ID {}".format(self.task_type) + f"Could not create a valid task for task type ID {self.task_type}", ) diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index b3543f9ca..0e781c8ff 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -1,8 +1,10 @@ # License: BSD 3-Clause +from __future__ import annotations import numpy as np 
from openml.tasks import TaskType, get_task + from .test_supervised_task import OpenMLSupervisedTaskTest @@ -10,25 +12,25 @@ class OpenMLLearningCurveTaskTest(OpenMLSupervisedTaskTest): __test__ = True def setUp(self, n_levels: int = 1): - super(OpenMLLearningCurveTaskTest, self).setUp() + super().setUp() self.task_id = 801 # diabetes self.task_type = TaskType.LEARNING_CURVE self.estimation_procedure = 13 def test_get_X_and_Y(self): - X, Y = super(OpenMLLearningCurveTaskTest, self).test_get_X_and_Y() - self.assertEqual((768, 8), X.shape) - self.assertIsInstance(X, np.ndarray) - self.assertEqual((768,), Y.shape) - self.assertIsInstance(Y, np.ndarray) - self.assertEqual(Y.dtype, int) + X, Y = super().test_get_X_and_Y() + assert X.shape == (768, 8) + assert isinstance(X, np.ndarray) + assert Y.shape == (768,) + assert isinstance(Y, np.ndarray) + assert Y.dtype == int def test_download_task(self): - task = super(OpenMLLearningCurveTaskTest, self).test_download_task() - self.assertEqual(task.task_id, self.task_id) - self.assertEqual(task.task_type_id, TaskType.LEARNING_CURVE) - self.assertEqual(task.dataset_id, 20) + task = super().test_download_task() + assert task.task_id == self.task_id + assert task.task_type_id == TaskType.LEARNING_CURVE + assert task.dataset_id == 20 def test_class_labels(self): task = get_task(self.task_id) - self.assertEqual(task.class_labels, ["tested_negative", "tested_positive"]) + assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index c958bb3dd..29a8254df 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -1,13 +1,15 @@ # License: BSD 3-Clause +from __future__ import annotations import ast + import numpy as np import openml -from openml.tasks import TaskType -from openml.testing import TestBase -from openml.testing import check_task_existence from openml.exceptions import 
OpenMLServerException +from openml.tasks import TaskType +from openml.testing import TestBase, check_task_existence + from .test_supervised_task import OpenMLSupervisedTaskTest @@ -15,7 +17,7 @@ class OpenMLRegressionTaskTest(OpenMLSupervisedTaskTest): __test__ = True def setUp(self, n_levels: int = 1): - super(OpenMLRegressionTaskTest, self).setUp() + super().setUp() task_meta_data = { "task_type": TaskType.SUPERVISED_REGRESSION, @@ -34,7 +36,7 @@ def setUp(self, n_levels: int = 1): task_id = new_task.task_id # mark to remove the uploaded task TestBase._mark_entity_for_removal("task", task_id) - TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + TestBase.logger.info(f"collected from test_run_functions: {task_id}") except OpenMLServerException as e: if e.code == 614: # Task already exists # the exception message contains the task_id that was matched in the format @@ -47,15 +49,15 @@ def setUp(self, n_levels: int = 1): self.estimation_procedure = 7 def test_get_X_and_Y(self): - X, Y = super(OpenMLRegressionTaskTest, self).test_get_X_and_Y() - self.assertEqual((194, 32), X.shape) - self.assertIsInstance(X, np.ndarray) - self.assertEqual((194,), Y.shape) - self.assertIsInstance(Y, np.ndarray) - self.assertEqual(Y.dtype, float) + X, Y = super().test_get_X_and_Y() + assert X.shape == (194, 32) + assert isinstance(X, np.ndarray) + assert Y.shape == (194,) + assert isinstance(Y, np.ndarray) + assert Y.dtype == float def test_download_task(self): - task = super(OpenMLRegressionTaskTest, self).test_download_task() - self.assertEqual(task.task_id, self.task_id) - self.assertEqual(task.task_type_id, TaskType.SUPERVISED_REGRESSION) - self.assertEqual(task.dataset_id, 105) + task = super().test_download_task() + assert task.task_id == self.task_id + assert task.task_type_id == TaskType.SUPERVISED_REGRESSION + assert task.dataset_id == 105 diff --git a/tests/test_tasks/test_split.py b/tests/test_tasks/test_split.py index 7d8004a91..12cb632d9 100644 
--- a/tests/test_tasks/test_split.py +++ b/tests/test_tasks/test_split.py @@ -1,7 +1,9 @@ # License: BSD 3-Clause +from __future__ import annotations import inspect import os +from pathlib import Path import numpy as np @@ -17,18 +19,17 @@ def setUp(self): __file__ = inspect.getfile(OpenMLSplitTest) self.directory = os.path.dirname(__file__) # This is for dataset - self.arff_filename = os.path.join( - self.directory, - "..", - "files", - "org", - "openml", - "test", - "tasks", - "1882", - "datasplits.arff", + self.arff_filepath = ( + Path(self.directory).parent + / "files" + / "org" + / "openml" + / "test" + / "tasks" + / "1882" + / "datasplits.arff" ) - self.pd_filename = self.arff_filename.replace(".arff", ".pkl.py3") + self.pd_filename = self.arff_filepath.with_suffix(".pkl.py3") def tearDown(self): try: @@ -38,49 +39,49 @@ def tearDown(self): pass def test_eq(self): - split = OpenMLSplit._from_arff_file(self.arff_filename) - self.assertEqual(split, split) + split = OpenMLSplit._from_arff_file(self.arff_filepath) + assert split == split - split2 = OpenMLSplit._from_arff_file(self.arff_filename) + split2 = OpenMLSplit._from_arff_file(self.arff_filepath) split2.name = "a" - self.assertNotEqual(split, split2) + assert split != split2 - split2 = OpenMLSplit._from_arff_file(self.arff_filename) + split2 = OpenMLSplit._from_arff_file(self.arff_filepath) split2.description = "a" - self.assertNotEqual(split, split2) + assert split != split2 - split2 = OpenMLSplit._from_arff_file(self.arff_filename) - split2.split[10] = dict() - self.assertNotEqual(split, split2) + split2 = OpenMLSplit._from_arff_file(self.arff_filepath) + split2.split[10] = {} + assert split != split2 - split2 = OpenMLSplit._from_arff_file(self.arff_filename) - split2.split[0][10] = dict() - self.assertNotEqual(split, split2) + split2 = OpenMLSplit._from_arff_file(self.arff_filepath) + split2.split[0][10] = {} + assert split != split2 def test_from_arff_file(self): - split = 
OpenMLSplit._from_arff_file(self.arff_filename) - self.assertIsInstance(split.split, dict) - self.assertIsInstance(split.split[0], dict) - self.assertIsInstance(split.split[0][0], dict) - self.assertIsInstance(split.split[0][0][0][0], np.ndarray) - self.assertIsInstance(split.split[0][0][0].train, np.ndarray) - self.assertIsInstance(split.split[0][0][0].train, np.ndarray) - self.assertIsInstance(split.split[0][0][0][1], np.ndarray) - self.assertIsInstance(split.split[0][0][0].test, np.ndarray) - self.assertIsInstance(split.split[0][0][0].test, np.ndarray) + split = OpenMLSplit._from_arff_file(self.arff_filepath) + assert isinstance(split.split, dict) + assert isinstance(split.split[0], dict) + assert isinstance(split.split[0][0], dict) + assert isinstance(split.split[0][0][0][0], np.ndarray) + assert isinstance(split.split[0][0][0].train, np.ndarray) + assert isinstance(split.split[0][0][0].train, np.ndarray) + assert isinstance(split.split[0][0][0][1], np.ndarray) + assert isinstance(split.split[0][0][0].test, np.ndarray) + assert isinstance(split.split[0][0][0].test, np.ndarray) for i in range(10): for j in range(10): - self.assertGreaterEqual(split.split[i][j][0].train.shape[0], 808) - self.assertGreaterEqual(split.split[i][j][0].test.shape[0], 89) - self.assertEqual( - split.split[i][j][0].train.shape[0] + split.split[i][j][0].test.shape[0], 898 + assert split.split[i][j][0].train.shape[0] >= 808 + assert split.split[i][j][0].test.shape[0] >= 89 + assert ( + split.split[i][j][0].train.shape[0] + split.split[i][j][0].test.shape[0] == 898 ) def test_get_split(self): - split = OpenMLSplit._from_arff_file(self.arff_filename) + split = OpenMLSplit._from_arff_file(self.arff_filepath) train_split, test_split = split.get(fold=5, repeat=2) - self.assertEqual(train_split.shape[0], 808) - self.assertEqual(test_split.shape[0], 90) + assert train_split.shape[0] == 808 + assert test_split.shape[0] == 90 self.assertRaisesRegex( ValueError, "Repeat 10 not known", diff --git 
a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py index 69b6a3c1d..00ce1f276 100644 --- a/tests/test_tasks/test_supervised_task.py +++ b/tests/test_tasks/test_supervised_task.py @@ -1,11 +1,12 @@ # License: BSD 3-Clause +from __future__ import annotations -from typing import Tuple import unittest import numpy as np from openml.tasks import get_task + from .test_task import OpenMLTaskTest @@ -21,12 +22,12 @@ class OpenMLSupervisedTaskTest(OpenMLTaskTest): def setUpClass(cls): if cls is OpenMLSupervisedTaskTest: raise unittest.SkipTest("Skip OpenMLSupervisedTaskTest tests," " it's a base class") - super(OpenMLSupervisedTaskTest, cls).setUpClass() + super().setUpClass() def setUp(self, n_levels: int = 1): - super(OpenMLSupervisedTaskTest, self).setUp() + super().setUp() - def test_get_X_and_Y(self) -> Tuple[np.ndarray, np.ndarray]: + def test_get_X_and_Y(self) -> tuple[np.ndarray, np.ndarray]: task = get_task(self.task_id) X, Y = task.get_X_and_y() return X, Y diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index cd8e515c1..ec5a8caf5 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -1,16 +1,16 @@ # License: BSD 3-Clause +from __future__ import annotations import unittest -from typing import List from random import randint, shuffle -from openml.exceptions import OpenMLServerException -from openml.testing import TestBase from openml.datasets import ( get_dataset, list_datasets, ) +from openml.exceptions import OpenMLServerException from openml.tasks import TaskType, create_task, get_task +from openml.testing import TestBase class OpenMLTaskTest(TestBase): @@ -25,10 +25,10 @@ class OpenMLTaskTest(TestBase): def setUpClass(cls): if cls is OpenMLTaskTest: raise unittest.SkipTest("Skip OpenMLTaskTest tests," " it's a base class") - super(OpenMLTaskTest, cls).setUpClass() + super().setUpClass() def setUp(self, n_levels: int = 1): - super(OpenMLTaskTest, self).setUp() + 
super().setUp() def test_download_task(self): return get_task(self.task_id) @@ -53,7 +53,7 @@ def test_upload_task(self): task.publish() TestBase._mark_entity_for_removal("task", task.id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], task.id) + "collected from {}: {}".format(__file__.split("/")[-1], task.id), ) # success break @@ -67,10 +67,10 @@ def test_upload_task(self): raise e else: raise ValueError( - "Could not create a valid task for task type ID {}".format(self.task_type) + f"Could not create a valid task for task type ID {self.task_type}", ) - def _get_compatible_rand_dataset(self) -> List: + def _get_compatible_rand_dataset(self) -> list: active_datasets = list_datasets(status="active", output_format="dataframe") # depending on the task type, find either datasets diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 481ef2d83..3dc776a2b 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -1,41 +1,42 @@ # License: BSD 3-Clause +from __future__ import annotations import os +import unittest from typing import cast from unittest import mock +import pandas as pd import pytest import requests -from openml.tasks import TaskType -from openml.testing import TestBase, create_request_response +import openml from openml import OpenMLSplit, OpenMLTask from openml.exceptions import OpenMLCacheException, OpenMLNotAuthorizedError, OpenMLServerException -import openml -import unittest -import pandas as pd +from openml.tasks import TaskType +from openml.testing import TestBase, create_request_response class TestTask(TestBase): _multiprocess_can_split_ = True def setUp(self): - super(TestTask, self).setUp() + super().setUp() def tearDown(self): - super(TestTask, self).tearDown() + super().tearDown() def test__get_cached_tasks(self): openml.config.set_root_cache_directory(self.static_cache_dir) tasks = openml.tasks.functions._get_cached_tasks() 
- self.assertIsInstance(tasks, dict) - self.assertEqual(len(tasks), 3) - self.assertIsInstance(list(tasks.values())[0], OpenMLTask) + assert isinstance(tasks, dict) + assert len(tasks) == 3 + assert isinstance(next(iter(tasks.values())), OpenMLTask) def test__get_cached_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.functions._get_cached_task(1) - self.assertIsInstance(task, OpenMLTask) + assert isinstance(task, OpenMLTask) def test__get_cached_task_not_cached(self): openml.config.set_root_cache_directory(self.static_cache_dir) @@ -48,12 +49,11 @@ def test__get_cached_task_not_cached(self): def test__get_estimation_procedure_list(self): estimation_procedures = openml.tasks.functions._get_estimation_procedure_list() - self.assertIsInstance(estimation_procedures, list) - self.assertIsInstance(estimation_procedures[0], dict) - self.assertEqual( - estimation_procedures[0]["task_type_id"], TaskType.SUPERVISED_CLASSIFICATION - ) + assert isinstance(estimation_procedures, list) + assert isinstance(estimation_procedures[0], dict) + assert estimation_procedures[0]["task_type_id"] == TaskType.SUPERVISED_CLASSIFICATION + @pytest.mark.production() def test_list_clustering_task(self): # as shown by #383, clustering tasks can give list/dict casting problems openml.config.server = self.production_server @@ -61,28 +61,28 @@ def test_list_clustering_task(self): # the expected outcome is that it doesn't crash. No assertions. 
def _check_task(self, task): - self.assertEqual(type(task), dict) - self.assertGreaterEqual(len(task), 2) - self.assertIn("did", task) - self.assertIsInstance(task["did"], int) - self.assertIn("status", task) - self.assertIsInstance(task["status"], str) - self.assertIn(task["status"], ["in_preparation", "active", "deactivated"]) + assert type(task) == dict + assert len(task) >= 2 + assert "did" in task + assert isinstance(task["did"], int) + assert "status" in task + assert isinstance(task["status"], str) + assert task["status"] in ["in_preparation", "active", "deactivated"] def test_list_tasks_by_type(self): num_curves_tasks = 198 # number is flexible, check server if fails ttid = TaskType.LEARNING_CURVE tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe") - self.assertGreaterEqual(len(tasks), num_curves_tasks) + assert len(tasks) >= num_curves_tasks for task in tasks.to_dict(orient="index").values(): - self.assertEqual(ttid, task["ttid"]) + assert ttid == task["ttid"] self._check_task(task) def test_list_tasks_output_format(self): ttid = TaskType.LEARNING_CURVE tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe") - self.assertIsInstance(tasks, pd.DataFrame) - self.assertGreater(len(tasks), 100) + assert isinstance(tasks, pd.DataFrame) + assert len(tasks) > 100 def test_list_tasks_empty(self): tasks = cast( @@ -94,13 +94,13 @@ def test_list_tasks_empty(self): def test_list_tasks_by_tag(self): num_basic_tasks = 100 # number is flexible, check server if fails tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe") - self.assertGreaterEqual(len(tasks), num_basic_tasks) + assert len(tasks) >= num_basic_tasks for task in tasks.to_dict(orient="index").values(): self._check_task(task) def test_list_tasks(self): tasks = openml.tasks.list_tasks(output_format="dataframe") - self.assertGreaterEqual(len(tasks), 900) + assert len(tasks) >= 900 for task in tasks.to_dict(orient="index").values(): 
self._check_task(task) @@ -109,7 +109,7 @@ def test_list_tasks_paginate(self): max = 100 for i in range(0, max, size): tasks = openml.tasks.list_tasks(offset=i, size=size, output_format="dataframe") - self.assertGreaterEqual(size, len(tasks)) + assert size >= len(tasks) for task in tasks.to_dict(orient="index").values(): self._check_task(task) @@ -124,11 +124,14 @@ def test_list_tasks_per_type_paginate(self): for j in task_types: for i in range(0, max, size): tasks = openml.tasks.list_tasks( - task_type=j, offset=i, size=size, output_format="dataframe" + task_type=j, + offset=i, + size=size, + output_format="dataframe", ) - self.assertGreaterEqual(size, len(tasks)) + assert size >= len(tasks) for task in tasks.to_dict(orient="index").values(): - self.assertEqual(j, task["ttid"]) + assert j == task["ttid"] self._check_task(task) def test__get_task(self): @@ -136,8 +139,9 @@ def test__get_task(self): openml.tasks.get_task(1882) @unittest.skip( - "Please await outcome of discussion: https://github.com/openml/OpenML/issues/776" - ) # noqa: E501 + "Please await outcome of discussion: https://github.com/openml/OpenML/issues/776", + ) + @pytest.mark.production() def test__get_task_live(self): # Test the following task as it used to throw an Unicode Error. 
# https://github.com/openml/openml-python/issues/378 @@ -146,66 +150,36 @@ def test__get_task_live(self): def test_get_task(self): task = openml.tasks.get_task(1) # anneal; crossvalidation - self.assertIsInstance(task, OpenMLTask) - self.assertTrue( - os.path.exists( - os.path.join( - self.workdir, - "org", - "openml", - "test", - "tasks", - "1", - "task.xml", - ) - ) + assert isinstance(task, OpenMLTask) + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "task.xml") ) - self.assertTrue( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") - ) + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") ) - self.assertTrue( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") - ) + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") ) def test_get_task_lazy(self): task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation - self.assertIsInstance(task, OpenMLTask) - self.assertTrue( - os.path.exists( - os.path.join( - self.workdir, - "org", - "openml", - "test", - "tasks", - "2", - "task.xml", - ) - ) + assert isinstance(task, OpenMLTask) + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "task.xml") ) - self.assertEqual(task.class_labels, ["1", "2", "3", "4", "5", "U"]) + assert task.class_labels == ["1", "2", "3", "4", "5", "U"] - self.assertFalse( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") - ) + assert not os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") ) # Since the download_data=False is propagated to get_dataset - self.assertFalse( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "datasets", "2", "dataset.arff") 
- ) + assert not os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "datasets", "2", "dataset.arff") ) task.download_split() - self.assertTrue( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") - ) + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") ) @mock.patch("openml.tasks.functions.get_dataset") @@ -224,13 +198,14 @@ def assert_and_raise(*args, **kwargs): except WeirdException: pass # Now the file should no longer exist - self.assertFalse(os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml"))) + assert not os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml")) def test_get_task_with_cache(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1) - self.assertIsInstance(task, OpenMLTask) + assert isinstance(task, OpenMLTask) + @pytest.mark.production() def test_get_task_different_types(self): openml.config.server = self.production_server # Regression task @@ -243,11 +218,9 @@ def test_get_task_different_types(self): def test_download_split(self): task = openml.tasks.get_task(1) # anneal; crossvalidation split = task.download_split() - self.assertEqual(type(split), OpenMLSplit) - self.assertTrue( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") - ) + assert type(split) == OpenMLSplit + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") ) def test_deletion_of_cache_dir(self): @@ -256,9 +229,9 @@ def test_deletion_of_cache_dir(self): "tasks", 1, ) - self.assertTrue(os.path.exists(tid_cache_dir)) + assert os.path.exists(tid_cache_dir) openml.utils._remove_cache_dir_for_id("tasks", tid_cache_dir) - self.assertFalse(os.path.exists(tid_cache_dir)) + assert not os.path.exists(tid_cache_dir) @mock.patch.object(requests.Session, "delete") @@ -266,7 +239,8 
@@ def test_delete_task_not_owned(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_owned.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -287,7 +261,8 @@ def test_delete_task_with_run(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_has_runs.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -308,7 +283,8 @@ def test_delete_success(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_successful.xml" mock_delete.return_value = create_request_response( - status_code=200, content_filepath=content_file + status_code=200, + content_filepath=content_file, ) success = openml.tasks.delete_task(361323) @@ -326,7 +302,8 @@ def test_delete_unknown_task(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_exist.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 4f15ccce2..552fbe949 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -1,4 +1,5 @@ # License: BSD 3-Clause +from __future__ import annotations from time 
import time @@ -9,40 +10,50 @@ # Common methods between tasks class OpenMLTaskMethodsTest(TestBase): def setUp(self): - super(OpenMLTaskMethodsTest, self).setUp() + super().setUp() def tearDown(self): - super(OpenMLTaskMethodsTest, self).tearDown() + super().tearDown() def test_tagging(self): task = openml.tasks.get_task(1) # anneal; crossvalidation - tag = "testing_tag_{}_{}".format(self.id(), time()) + # tags can be at most 64 alphanumeric (+ underscore) chars + unique_indicator = str(time()).replace(".", "") + tag = f"test_tag_OpenMLTaskMethodsTest_{unique_indicator}" tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") - self.assertEqual(len(tasks), 0) + assert len(tasks) == 0 task.push_tag(tag) tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") - self.assertEqual(len(tasks), 1) - self.assertIn(1, tasks["tid"]) + assert len(tasks) == 1 + assert 1 in tasks["tid"] task.remove_tag(tag) tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") - self.assertEqual(len(tasks), 0) + assert len(tasks) == 0 def test_get_train_and_test_split_indices(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1882) train_indices, test_indices = task.get_train_test_split_indices(0, 0) - self.assertEqual(16, train_indices[0]) - self.assertEqual(395, train_indices[-1]) - self.assertEqual(412, test_indices[0]) - self.assertEqual(364, test_indices[-1]) + assert train_indices[0] == 16 + assert train_indices[-1] == 395 + assert test_indices[0] == 412 + assert test_indices[-1] == 364 train_indices, test_indices = task.get_train_test_split_indices(2, 2) - self.assertEqual(237, train_indices[0]) - self.assertEqual(681, train_indices[-1]) - self.assertEqual(583, test_indices[0]) - self.assertEqual(24, test_indices[-1]) + assert train_indices[0] == 237 + assert train_indices[-1] == 681 + assert test_indices[0] == 583 + assert test_indices[-1] == 24 self.assertRaisesRegex( - ValueError, "Fold 10 not known", 
task.get_train_test_split_indices, 10, 0 + ValueError, + "Fold 10 not known", + task.get_train_test_split_indices, + 10, + 0, ) self.assertRaisesRegex( - ValueError, "Repeat 10 not known", task.get_train_test_split_indices, 0, 10 + ValueError, + "Repeat 10 not known", + task.get_train_test_split_indices, + 0, + 10, ) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 93bfdb890..cae947917 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -1,118 +1,205 @@ +from __future__ import annotations + import os -import tempfile import unittest.mock - +import pytest +import shutil import openml -from openml.testing import TestBase +from openml.testing import _check_dataset -class OpenMLTaskTest(TestBase): - _multiprocess_can_split_ = True +@pytest.fixture(autouse=True) +def as_robot(): + policy = openml.config.retry_policy + n_retries = openml.config.connection_n_retries + openml.config.set_retry_policy("robot", n_retries=20) + yield + openml.config.set_retry_policy(policy, n_retries) - def mocked_perform_api_call(call, request_method): - # TODO: JvR: Why is this not a staticmethod? - url = openml.config.server + "/" + call - return openml._api_calls._download_text_file(url) - def test_list_all(self): - openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) - openml.utils._list_all( - listing_call=openml.tasks.functions._list_tasks, output_format="dataframe" - ) +@pytest.fixture(autouse=True) +def with_test_server(): + openml.config.start_using_configuration_for_example() + yield + openml.config.stop_using_configuration_for_example() - def test_list_all_with_multiple_batches(self): - res = openml.utils._list_all( - listing_call=openml.tasks.functions._list_tasks, output_format="dict", batch_size=1050 - ) - # Verify that test server state is still valid for this test to work as intended - # -> If the number of results is less than 1050, the test can not test the - # batching operation. 
By having more than 1050 results we know that batching - # was triggered. 1050 appears to be a number of tasks that is available on a fresh - # test server. - assert len(res) > 1050 - openml.utils._list_all( - listing_call=openml.tasks.functions._list_tasks, - output_format="dataframe", - batch_size=1050, - ) - # Comparing the number of tasks is not possible as other unit tests running in - # parallel might be adding or removing tasks! - # assert len(res) <= len(res2) - - @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=mocked_perform_api_call) - def test_list_all_few_results_available(self, _perform_api_call): - # we want to make sure that the number of api calls is only 1. - # Although we have multiple versions of the iris dataset, there is only - # one with this name/version combination - - datasets = openml.datasets.list_datasets( - size=1000, data_name="iris", data_version=1, output_format="dataframe" - ) - self.assertEqual(len(datasets), 1) - self.assertEqual(_perform_api_call.call_count, 1) - def test_list_all_for_datasets(self): - required_size = 127 # default test server reset value - datasets = openml.datasets.list_datasets( - batch_size=100, size=required_size, output_format="dataframe" +@pytest.fixture(autouse=True) +def with_test_cache(test_files_directory, request): + if not test_files_directory.exists(): + raise ValueError( + f"Cannot find test cache dir, expected it to be {test_files_directory!s}!", ) + _root_cache_directory = openml.config._root_cache_directory + tmp_cache = test_files_directory / request.node.name + openml.config.set_root_cache_directory(tmp_cache) + yield + openml.config.set_root_cache_directory(_root_cache_directory) + if tmp_cache.exists(): + shutil.rmtree(tmp_cache) - self.assertEqual(len(datasets), required_size) - for dataset in datasets.to_dict(orient="index").values(): - self._check_dataset(dataset) - def test_list_all_for_tasks(self): - required_size = 1068 # default test server reset value - tasks 
= openml.tasks.list_tasks( - batch_size=1000, size=required_size, output_format="dataframe" - ) - self.assertEqual(len(tasks), required_size) +@pytest.fixture() +def min_number_tasks_on_test_server() -> int: + """After a reset at least 1068 tasks are on the test server""" + return 1068 - def test_list_all_for_flows(self): - required_size = 15 # default test server reset value - flows = openml.flows.list_flows( - batch_size=25, size=required_size, output_format="dataframe" - ) - self.assertEqual(len(flows), required_size) - def test_list_all_for_setups(self): - required_size = 50 - # TODO apparently list_setups function does not support kwargs - setups = openml.setups.list_setups(size=required_size) +@pytest.fixture() +def min_number_datasets_on_test_server() -> int: + """After a reset at least 127 datasets are on the test server""" + return 127 - # might not be on test server after reset, please rerun test at least once if fails - self.assertEqual(len(setups), required_size) - def test_list_all_for_runs(self): - required_size = 21 - runs = openml.runs.list_runs(batch_size=25, size=required_size) +@pytest.fixture() +def min_number_flows_on_test_server() -> int: + """After a reset at least 127 flows are on the test server""" + return 15 - # might not be on test server after reset, please rerun test at least once if fails - self.assertEqual(len(runs), required_size) - def test_list_all_for_evaluations(self): - required_size = 22 - # TODO apparently list_evaluations function does not support kwargs - evaluations = openml.evaluations.list_evaluations( - function="predictive_accuracy", size=required_size - ) +@pytest.fixture() +def min_number_setups_on_test_server() -> int: + """After a reset at least 50 setups are on the test server""" + return 50 + - # might not be on test server after reset, please rerun test at least once if fails - self.assertEqual(len(evaluations), required_size) - - @unittest.mock.patch("openml.config.get_cache_directory") - 
@unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033") - def test__create_cache_directory(self, config_mock): - with tempfile.TemporaryDirectory(dir=self.workdir) as td: - config_mock.return_value = td - openml.utils._create_cache_directory("abc") - self.assertTrue(os.path.exists(os.path.join(td, "abc"))) - subdir = os.path.join(td, "def") - os.mkdir(subdir) - os.chmod(subdir, 0o444) - config_mock.return_value = subdir - with self.assertRaisesRegex( - openml.exceptions.OpenMLCacheException, - r"Cannot create cache directory", - ): - openml.utils._create_cache_directory("ghi") +@pytest.fixture() +def min_number_runs_on_test_server() -> int: + """After a reset at least 21 runs are on the test server""" + return 21 + + +@pytest.fixture() +def min_number_evaluations_on_test_server() -> int: + """After a reset at least 22 evaluations are on the test server""" + return 22 + + +def _mocked_perform_api_call(call, request_method): + url = openml.config.server + "/" + call + return openml._api_calls._download_text_file(url) + + +@pytest.mark.server() +def test_list_all(): + openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) + openml.utils._list_all( + listing_call=openml.tasks.functions._list_tasks, + list_output_format="dataframe", + ) + + +@pytest.mark.server() +def test_list_all_for_tasks(min_number_tasks_on_test_server): + tasks = openml.tasks.list_tasks( + batch_size=1000, + size=min_number_tasks_on_test_server, + output_format="dataframe", + ) + assert min_number_tasks_on_test_server == len(tasks) + + +@pytest.mark.server() +def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): + # By setting the batch size one lower than the minimum we guarantee at least two + # batches and at the same time do as few batches (roundtrips) as possible. 
+ batch_size = min_number_tasks_on_test_server - 1 + res = openml.utils._list_all( + listing_call=openml.tasks.functions._list_tasks, + list_output_format="dataframe", + batch_size=batch_size, + ) + assert min_number_tasks_on_test_server <= len(res) + + +@pytest.mark.server() +def test_list_all_for_datasets(min_number_datasets_on_test_server): + datasets = openml.datasets.list_datasets( + batch_size=100, + size=min_number_datasets_on_test_server, + output_format="dataframe", + ) + + assert min_number_datasets_on_test_server == len(datasets) + for dataset in datasets.to_dict(orient="index").values(): + _check_dataset(dataset) + + +@pytest.mark.server() +def test_list_all_for_flows(min_number_flows_on_test_server): + flows = openml.flows.list_flows( + batch_size=25, + size=min_number_flows_on_test_server, + output_format="dataframe", + ) + assert min_number_flows_on_test_server == len(flows) + + +@pytest.mark.server() +@pytest.mark.flaky() # Other tests might need to upload runs first +def test_list_all_for_setups(min_number_setups_on_test_server): + # TODO apparently list_setups function does not support kwargs + setups = openml.setups.list_setups(size=min_number_setups_on_test_server) + assert min_number_setups_on_test_server == len(setups) + + +@pytest.mark.server() +@pytest.mark.flaky() # Other tests might need to upload runs first +def test_list_all_for_runs(min_number_runs_on_test_server): + runs = openml.runs.list_runs(batch_size=25, size=min_number_runs_on_test_server) + assert min_number_runs_on_test_server == len(runs) + + +@pytest.mark.server() +@pytest.mark.flaky() # Other tests might need to upload runs first +def test_list_all_for_evaluations(min_number_evaluations_on_test_server): + # TODO apparently list_evaluations function does not support kwargs + evaluations = openml.evaluations.list_evaluations( + function="predictive_accuracy", + size=min_number_evaluations_on_test_server, + ) + assert min_number_evaluations_on_test_server == len(evaluations) + 
+ +@pytest.mark.server() +@unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call) +def test_list_all_few_results_available(_perform_api_call): + datasets = openml.datasets.list_datasets( + size=1000, + data_name="iris", + data_version=1, + output_format="dataframe", + ) + assert len(datasets) == 1, "only one iris dataset version 1 should be present" + assert _perform_api_call.call_count == 1, "expect just one call to get one dataset" + + +@unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033") +@unittest.mock.patch("openml.config.get_cache_directory") +def test__create_cache_directory(config_mock, tmp_path): + config_mock.return_value = tmp_path + openml.utils._create_cache_directory("abc") + assert (tmp_path / "abc").exists() + + subdir = tmp_path / "def" + subdir.mkdir() + subdir.chmod(0o444) + config_mock.return_value = subdir + with pytest.raises( + openml.exceptions.OpenMLCacheException, + match="Cannot create cache directory", + ): + openml.utils._create_cache_directory("ghi") + + +@pytest.mark.server() +def test_correct_test_server_download_state(): + """This test verifies that the test server downloads the data from the correct source. + + If this tests fails, it is highly likely that the test server is not configured correctly. + Usually, this means that the test server is serving data from the task with the same ID from the production server. + That is, it serves parquet files wrongly associated with the test server's task. + """ + task = openml.tasks.get_task(119) + dataset = task.get_dataset() + assert len(dataset.features) == dataset.get_data(dataset_format="dataframe")[0].shape[1] \ No newline at end of file