diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 2d17eec10..000000000 --- a/.flake8 +++ /dev/null @@ -1,11 +0,0 @@ -[flake8] -max-line-length = 100 -show-source = True -select = C,E,F,W,B,T -ignore = E203, E402, W503 -per-file-ignores = - *__init__.py:F401 - *cli.py:T201 -exclude = - venv - examples diff --git a/.github/workflows/dist.yaml b/.github/workflows/dist.yaml index 63641ae72..b81651cea 100644 --- a/.github/workflows/dist.yaml +++ b/.github/workflows/dist.yaml @@ -1,19 +1,37 @@ name: dist-check -on: [push, pull_request] +on: + workflow_dispatch: + + push: + branches: + - main + - develop + tags: + - "v*.*.*" + + pull_request: + branches: + - main + - develop + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: dist: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.8 - name: Build dist run: | - python setup.py sdist + pip install build + python -m build --sdist - name: Twine check run: | pip install twine diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index e601176b3..e50d67710 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -1,13 +1,30 @@ name: Docs -on: [pull_request, push] +on: + workflow_dispatch: + + push: + branches: + - main + - develop + tags: + - "v*.*.*" + + pull_request: + branches: + - main + - develop + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: build-and-deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.8 - name: Install dependencies diff --git a/.github/workflows/pre-commit.yaml 
b/.github/workflows/pre-commit.yaml index 074ae7add..9d1ab7fa8 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -1,14 +1,31 @@ name: pre-commit -on: [push] +on: + workflow_dispatch: + + push: + branches: + - main + - develop + tags: + - "v*.*.*" + + pull_request: + branches: + - main + - develop + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: run-all-files: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python 3.8 - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.8 - name: Install pre-commit diff --git a/.github/workflows/release_docker.yaml b/.github/workflows/release_docker.yaml index 6ceb1d060..c8f8c59f8 100644 --- a/.github/workflows/release_docker.yaml +++ b/.github/workflows/release_docker.yaml @@ -1,11 +1,20 @@ name: release-docker on: + workflow_dispatch: push: branches: - - 'main' - 'develop' - 'docker' + tags: + - 'v*' + pull_request: + branches: + - 'develop' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: @@ -15,34 +24,46 @@ jobs: steps: - name: Set up QEMU - uses: docker/setup-qemu-action@v2 + uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Check out the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Extract metadata (tags, labels) for Docker Hub id: meta_dockerhub - uses: docker/metadata-action@v4 + uses: docker/metadata-action@v5 with: images: "openml/openml-python" - name: Build and push id: docker_build - 
uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: ./docker/ - push: true tags: ${{ steps.meta_dockerhub.outputs.tags }} labels: ${{ steps.meta_dockerhub.outputs.labels }} + platforms: linux/amd64,linux/arm64 + push: ${{ github.event_name == 'push' }} + + - name: Update repo description + if: ${{ startsWith(github.ref, 'refs/tags/v') }} + uses: peter-evans/dockerhub-description@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + repository: openml/openml-python + short-description: "pre-installed openml-python environment" + readme-filepath: ./docker/readme.md - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 246c38da4..ab60f59c6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,6 +1,23 @@ name: Tests -on: [push, pull_request] +on: + workflow_dispatch: + + push: + branches: + - main + - develop + tags: + - "v*.*.*" + + pull_request: + branches: + - main + - develop + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test: @@ -8,62 +25,42 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.7, 3.8] - scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24] + python-version: ["3.8"] + # TODO(eddiebergman): We should consider testing against newer version I guess... 
+ # We probably consider just having a `"1"` version to always test against latest + scikit-learn: ["0.23.1", "0.24"] os: [ubuntu-latest] - sklearn-only: ['true'] - exclude: # no scikit-learn 0.21.2 release for Python 3.8 - - python-version: 3.8 - scikit-learn: 0.21.2 + sklearn-only: ["true"] + exclude: # no scikit-learn 0.23 release for Python 3.9 + - python-version: "3.9" + scikit-learn: "0.23.1" include: - - python-version: 3.6 - scikit-learn: 0.18.2 - scipy: 1.2.0 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.19.2 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.20.2 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.21.2 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.22.2 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.23.1 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.6 - scikit-learn: 0.24 - os: ubuntu-20.04 - sklearn-only: 'true' - - python-version: 3.8 + - os: ubuntu-latest + python-version: "3.9" + scikit-learn: "0.24" + scipy: "1.10.0" + sklearn-only: "true" + # Include a code cov version + - code-cov: true + os: ubuntu-latest + python-version: "3.8" scikit-learn: 0.23.1 - code-cov: true sklearn-only: 'false' - os: ubuntu-latest + # Include a windows test, for some reason on a later version of scikit-learn - os: windows-latest - sklearn-only: 'false' + python-version: "3.8" scikit-learn: 0.24.* - scipy: 1.10.0 + scipy: "1.10.0" # not sure why the explicit scipy version? 
+ sklearn-only: 'false' fail-fast: false - max-parallel: 4 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 2 - name: Setup Python ${{ matrix.python-version }} if: matrix.os != 'windows-latest' # windows-latest only uses preinstalled Python (3.7.9) - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install test dependencies diff --git a/.gitignore b/.gitignore index 060db33be..90548b2c3 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,14 @@ doc/auto_examples/ doc/modules/generated/ doc/datasets/generated/ +# Some stuff from testing? +tests/files/org/openml/test/datasets/1/ +tests/files/org/openml/test/datasets/2/features.xml.pkl +tests/files/org/openml/test/datasets/2/qualities.xml.pkl +tests/files/org/openml/test/locks/ +tests/files/org/openml/test/tasks/1/datasplits.pkl.py3 +tests/files/org/openml/test/tasks/1882/datasplits.pkl.py3 + # Distribution / packaging .Python diff --git a/.nojekyll b/.nojekyll deleted file mode 100644 index e69de29bb..000000000 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fc1319d79..3505c316b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,42 +1,48 @@ +default_language_version: + python: python3 +files: | + (?x)^( + openml| + tests + )/.*\.py$ repos: - - repo: https://github.com/psf/black - rev: 23.3.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.13 hooks: - - id: black - args: [--line-length=100] + - id: ruff + args: [--fix, --exit-non-zero-on-fix, --no-cache] + - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.4.1 + rev: v1.8.0 hooks: - id: mypy - name: mypy openml - files: openml/.* additional_dependencies: - types-requests - types-python-dateutil - - id: mypy - name: mypy tests - files: tests/.* - additional_dependencies: - - types-requests - - types-python-dateutil - - id: mypy - name: mypy top-level-functions - files: 
openml/_api_calls.py - additional_dependencies: - - types-requests - - types-python-dateutil - args: [ --disallow-untyped-defs, --disallow-any-generics, - --disallow-any-explicit, --implicit-optional ] - - repo: https://github.com/pycqa/flake8 - rev: 6.0.0 + - repo: https://github.com/python-jsonschema/check-jsonschema + rev: 0.27.3 hooks: - - id: flake8 - name: flake8 openml - files: openml/.* - additional_dependencies: - - flake8-print==5.0.0 - - id: flake8 - name: flake8 tests - files: tests/.* - additional_dependencies: - - flake8-print==5.0.0 + - id: check-github-workflows + files: '^github/workflows/.*\.ya?ml$' + types: ["yaml"] + - id: check-dependabot + files: '^\.github/dependabot\.ya?ml$' + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-added-large-files + files: ".*" + - id: check-case-conflict + files: ".*" + - id: check-merge-conflict + files: ".*" + - id: check-yaml + files: ".*" + - id: end-of-file-fixer + files: ".*" + types: ["yaml"] + - id: check-toml + files: ".*" + types: ["toml"] + - id: debug-statements + files: '^src/.*\.py$' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 87c8ae3c6..c2b4be187 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -214,28 +214,32 @@ Before each commit, it will automatically run: but make sure to make adjustments if it does fail. If you want to run the pre-commit tests without doing a commit, run: - ```bash - $ pre-commit run --all-files - ``` +```bash +$ make check +``` +or on a system without make, like Windows: +```bash +$ pre-commit run --all-files +``` Make sure to do this at least once before your first commit to check your setup works. Executing a specific unit test can be done by specifying the module, test case, and test. 
To obtain a hierarchical list of all tests, run - ```bash - $ pytest --collect-only - - - - - - - - - - - - ``` +```bash +$ pytest --collect-only + + + + + + + + + + + +``` You may then run a specific module, test case, or unit test respectively: ```bash diff --git a/Makefile b/Makefile index 165bcea80..b097bd1f9 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,9 @@ CTAGS ?= ctags all: clean inplace test +check: + pre-commit run --all-files + clean: $(PYTHON) setup.py clean rm -rf dist openml.egg-info diff --git a/doc/conf.py b/doc/conf.py index a10187486..61ba4a46c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -119,7 +119,7 @@ # # currently disabled because without intersphinx we cannot link to numpy.ndarray # nitpicky = True - +linkcheck_ignore = [r"https://test.openml.org/t/.*"] # FIXME: to avoid test server bugs avoiding docs building # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for diff --git a/doc/contributing.rst b/doc/contributing.rst index e8d537338..34d1edb14 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -19,7 +19,7 @@ In particular, a few ways to contribute to openml-python are: For more information, see the :ref:`extensions` below. * Bug reports. If something doesn't work for you or is cumbersome, please open a new issue to let - us know about the problem. See `this section `_. + us know about the problem. See `this section `_. * `Cite OpenML `_ if you use it in a scientific publication. diff --git a/doc/progress.rst b/doc/progress.rst index 493b029e5..13efd720b 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -6,6 +6,23 @@ Changelog ========= +next +~~~~~~ + + * ... + +0.14.2 +~~~~~~ + + * MAINT #1280: Use the server-provided ``parquet_url`` instead of ``minio_url`` to determine the location of the parquet file. + * ADD #716: add documentation for remaining attributes of classes and functions. 
+ * ADD #1261: more annotations for type hints. + * MAINT #1294: update tests to new tag specification. + * FIX #1314: Update fetching a bucket from MinIO. + * FIX #1315: Make class label retrieval more lenient. + * ADD #1316: add feature descriptions ontologies support. + * MAINT #1310/#1307: switch to ruff and resolve all mypy errors. + 0.14.1 ~~~~~~ diff --git a/docker/Dockerfile b/docker/Dockerfile index c27abba40..a84723309 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,8 +2,8 @@ # Useful building docs or running unix tests from a Windows host. FROM python:3.10 -RUN git clone https://github.com/openml/openml-python.git omlp -WORKDIR omlp +RUN git clone https://github.com/openml/openml-python.git openml +WORKDIR openml RUN python -m venv venv RUN venv/bin/pip install wheel setuptools RUN venv/bin/pip install -e .[test,examples,docs,examples_unix] @@ -11,6 +11,8 @@ RUN venv/bin/pip install -e .[test,examples,docs,examples_unix] WORKDIR / RUN mkdir scripts ADD startup.sh scripts/ +ADD readme.md / + # Due to the nature of the Docker container it might often be built from Windows. # It is typical to have the files with \r\n line-ending, we want to remove it for the unix image. RUN sed -i 's/\r//g' scripts/startup.sh diff --git a/docker/readme.md b/docker/readme.md index 47ad6d23b..d0af9d9fe 100644 --- a/docker/readme.md +++ b/docker/readme.md @@ -1,86 +1,131 @@ # OpenML Python Container -This docker container has the latest development version of openml-python downloaded and pre-installed. -It can be used to run the unit tests or build the docs in a fresh and/or isolated unix environment. -Instructions only tested on a Windows host machine. +This docker container has the latest version of openml-python downloaded and pre-installed. +It can also be used by developers to run unit tests or build the docs in +a fresh and/or isolated unix environment. +This document contains information about: -First pull the docker image: + 1. 
[Usage](#usage): how to use the image and its main modes. + 2. [Using local or remote code](#using-local-or-remote-code): useful when testing your own latest changes. + 3. [Versions](#versions): identify which image to use. + 4. [Development](#for-developers): information about the Docker image for developers. - docker pull openml/openml-python +*note:* each docker image is shipped with a readme, which you can read with: +`docker run --entrypoint=/bin/cat openml/openml-python:TAG readme.md` ## Usage +There are three main ways to use the image: running a pre-installed Python environment, +running tests, and building documentation. - docker run -it openml/openml-python [DOC,TEST] [BRANCH] +### Running `Python` with pre-installed `OpenML-Python` (default): -The image is designed to work with two specified directories which may be mounted ([`docker --mount documentation`](https://docs.docker.com/storage/bind-mounts/#start-a-container-with-a-bind-mount)). -You can mount your openml-python folder to the `/code` directory to run tests or build docs on your local files. -You can mount an `/output` directory to which the container will write output (currently only used for docs). -Each can be mounted by adding a `--mount type=bind,source=SOURCE,destination=/DESTINATION` where `SOURCE` is the absolute path to your code or output directory, and `DESTINATION` is either `code` or `output`. - -E.g. mounting a code directory: +To run `Python` with a pre-installed `OpenML-Python` environment run: - docker run -i --mount type=bind,source="E:\\repositories/openml-python",destination="/code" -t openml/openml-python +```text +docker run -it openml/openml-python +``` -E.g. mounting an output directory: +this accepts the normal `Python` arguments, e.g.: - docker run -i --mount type=bind,source="E:\\files/output",destination="/output" -t openml/openml-python +```text +docker run openml/openml-python -c "import openml; print(openml.__version__)" +``` -You can mount both at the same time. 
+if you want to run a local script, it needs to be mounted first. Mount it into the +`openml` folder: -### Bash (default) -By default bash is invoked, you should also use the `-i` flag when starting the container so it processes input: +``` +docker run -v PATH/TO/FILE:/openml/MY_SCRIPT.py openml/openml-python MY_SCRIPT.py +``` - docker run -it openml/openml-python +### Running unit tests -### Building Documentation -There are two ways to build documentation, either directly from the `HEAD` of a branch on Github or from your local directory. +You can run the unit tests by passing `test` as the first argument. +It also requires a local or remote repository to be specified, which is explained +[below](#using-local-or-remote-code). For this example, we specify to test the +`develop` branch: -#### Building from a local repository -Building from a local directory requires you to mount it to the ``/code`` directory: +```text +docker run openml/openml-python test develop +``` - docker run --mount type=bind,source=PATH_TO_REPOSITORY,destination=/code -t openml/openml-python doc +### Building documentation -The produced documentation will be in your repository's ``doc/build`` folder. -If an `/output` folder is mounted, the documentation will *also* be copied there. +You can build the documentation by passing `doc` as the first argument, +you should [mount](https://docs.docker.com/storage/bind-mounts/#start-a-container-with-a-bind-mount) +an output directory in which the docs will be stored. You also need to provide a remote +or local repository as explained in [the section below](#using-local-or-remote-code). +In this example, we build documentation for the `develop` branch. +On Windows: -#### Building from an online repository -Building from a remote repository requires you to specify a branch. 
-The branch may be specified by name directly if it exists on the original repository (https://github.com/openml/openml-python/): +```text + docker run --mount type=bind,source="E:\\files/output",destination="/output" openml/openml-python doc develop +``` - docker run --mount type=bind,source=PATH_TO_OUTPUT,destination=/output -t openml/openml-python doc BRANCH +on Linux: +```text + docker run --mount type=bind,source="./output",destination="/output" openml/openml-python doc develop +``` + +see [the section below](#using-local-or-remote-code) for running against local changes +or a remote branch. -Where `BRANCH` is the name of the branch for which to generate the documentation. -It is also possible to build the documentation from the branch on a fork, in this case the `BRANCH` should be specified as `GITHUB_NAME#BRANCH` (e.g. `PGijsbers#my_feature`) and the name of the forked repository should be `openml-python`. +*Note: you can forgo mounting an output directory to test if the docs build successfully, +but the result will only be available within the docker container under `/openml/docs/build`.* -### Running tests -There are two ways to run tests, either directly from the `HEAD` of a branch on Github or from your local directory. -It works similar to building docs, but should specify `test` as mode. -For example, to run tests on your local repository: +## Using local or remote code - docker run --mount type=bind,source=PATH_TO_REPOSITORY,destination=/code -t openml/openml-python test - -Running tests from the state of an online repository is supported similar to building documentation (i.e. specify `BRANCH` instead of mounting `/code`). - -## Troubleshooting +You can build docs or run tests against your local repository or a Github repository. +In the examples below, change the `source` to match the location of your local repository. 
+ +### Using a local repository + +To use a local directory, mount it in the `/code` directory, on Windows: + +```text + docker run --mount type=bind,source="E:\\repositories/openml-python",destination="/code" openml/openml-python test +``` -When you are mounting a directory you can check that it is mounted correctly by running the image in bash mode. -Navigate to the `/code` and `/output` directories and see if the expected files are there. -If e.g. there is no code in your mounted `/code`, you should double-check the provided path to your host directory. +on Linux: +```text + docker run --mount type=bind,source="/Users/pietergijsbers/repositories/openml-python",destination="/code" openml/openml-python test +``` -## Notes for developers -This section contains some notes about the structure of the image, intended for those who want to work on it. +when building docs, you also need to mount an output directory as shown above, so add both: + +```text +docker run --mount type=bind,source="./output",destination="/output" --mount type=bind,source="/Users/pietergijsbers/repositories/openml-python",destination="/code" openml/openml-python doc +``` + +### Using a Github repository +Building from a remote repository requires you to specify a branch. +The branch may be specified by name directly if it exists on the original repository (https://github.com/openml/openml-python/): + + docker run --mount type=bind,source=PATH_TO_OUTPUT,destination=/output openml/openml-python [test,doc] BRANCH + +Where `BRANCH` is the name of the branch for which to generate the documentation. +It is also possible to build the documentation from the branch on a fork, +in this case the `BRANCH` should be specified as `GITHUB_NAME#BRANCH` (e.g. +`PGijsbers#my_feature_branch`) and the name of the forked repository should be `openml-python`. + +## For developers +This section contains some notes about the structure of the image, +intended for those who want to work on it. 
### Added Directories The `openml/openml-python` image is built on a vanilla `python:3` image. -Additionally it contains the following files are directories: - - - `/omlp`: contains the openml-python repository in the state with which the image was built by default. - If working with a `BRANCH`, this repository will be set to the `HEAD` of `BRANCH`. - - `/omlp/venv/`: contains the used virtual environment for `doc` and `test`. It has `openml-python` dependencies pre-installed. - When invoked with `doc` or `test`, the dependencies will be updated based on the `setup.py` of the `BRANCH` or mounted `/code`. +Additionally, it contains the following files and directories: + + - `/openml`: contains the openml-python repository in the state with which the image + was built by default. If working with a `BRANCH`, this repository will be set to + the `HEAD` of `BRANCH`. + - `/openml/venv/`: contains the used virtual environment for `doc` and `test`. It has + `openml-python` dependencies pre-installed. When invoked with `doc` or `test`, the + dependencies will be updated based on the `setup.py` of the `BRANCH` or mounted `/code`. - `/scripts/startup.sh`: the entrypoint of the image. Takes care of the automated features (e.g. `doc` and `test`). ## Building the image -To build the image yourself, execute `docker build -f Dockerfile .` from this directory. -It will use the `startup.sh` as is, so any local changes will be present in the image. +To build the image yourself, execute `docker build -f Dockerfile .` from the `docker` +directory of the `openml-python` repository. It will use the `startup.sh` as is, so any +local changes will be present in the image. diff --git a/docker/startup.sh b/docker/startup.sh index 2a75a621c..34a5c61f3 100644 --- a/docker/startup.sh +++ b/docker/startup.sh @@ -1,3 +1,6 @@ +# Entry script to switch between the different Docker functionalities. 
+# By default, execute Python with OpenML pre-installed +# # Entry script to allow docker to be ran for bash, tests and docs. # The script assumes a code repository can be mounted to ``/code`` and an output directory to ``/output``. # Executes ``mode`` on ``branch`` or the provided ``code`` directory. @@ -10,10 +13,11 @@ # Can be a branch on a Github fork, specified with the USERNAME#BRANCH format. # The test or doc build is executed on this branch. -if [ -z "$1" ]; then - echo "Executing in BASH mode." - bash - exit +if [[ ! ( $1 = "doc" || $1 = "test" ) ]]; then + cd openml + source venv/bin/activate + python "$@" + exit 0 fi # doc and test modes require mounted directories and/or specified branches @@ -32,8 +36,8 @@ if [ "$1" == "doc" ] && [ -n "$2" ] && ! [ -d "/output" ]; then fi if [ -n "$2" ]; then - # if a branch is provided, we will pull it into the `omlp` local repository that was created with the image. - cd omlp + # if a branch is provided, we will pull it into the `openml` local repository that was created with the image. 
+ cd openml if [[ $2 == *#* ]]; then # If a branch is specified on a fork (with NAME#BRANCH format), we have to construct the url before pulling # We add a trailing '#' delimiter so the second element doesn't get the trailing newline from <<< @@ -52,12 +56,12 @@ if [ -n "$2" ]; then exit 1 fi git pull - code_dir="/omlp" + code_dir="/openml" else code_dir="/code" fi -source /omlp/venv/bin/activate +source /openml/venv/bin/activate cd $code_dir # The most recent ``main`` is already installed, but we want to update any outdated dependencies pip install -e .[test,examples,docs,examples_unix] @@ -71,6 +75,6 @@ if [ "$1" == "doc" ]; then make html make linkcheck if [ -d "/output" ]; then - cp -r /omlp/doc/build /output + cp -r /openml/doc/build /output fi -fi +fi \ No newline at end of file diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 7f3f8cefb..000000000 --- a/mypy.ini +++ /dev/null @@ -1,6 +0,0 @@ -[mypy] -# Reports any config lines that are not recognized -warn_unused_configs=True - -ignore_missing_imports=True -follow_imports=skip diff --git a/openml/__init__.py b/openml/__init__.py index abb83ac0c..48d301eec 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -16,40 +16,46 @@ """ # License: BSD 3-Clause - -from . import _api_calls -from . import config -from .datasets import OpenMLDataset, OpenMLDataFeature -from . import datasets -from . import evaluations +from __future__ import annotations + +from . import ( + _api_calls, + config, + datasets, + evaluations, + exceptions, + extensions, + flows, + runs, + setups, + study, + tasks, + utils, +) +from .__version__ import __version__ +from .datasets import OpenMLDataFeature, OpenMLDataset from .evaluations import OpenMLEvaluation -from . import extensions -from . import exceptions -from . 
import tasks +from .flows import OpenMLFlow +from .runs import OpenMLRun +from .setups import OpenMLParameter, OpenMLSetup +from .study import OpenMLBenchmarkSuite, OpenMLStudy from .tasks import ( - OpenMLTask, - OpenMLSplit, - OpenMLSupervisedTask, OpenMLClassificationTask, - OpenMLRegressionTask, OpenMLClusteringTask, OpenMLLearningCurveTask, + OpenMLRegressionTask, + OpenMLSplit, + OpenMLSupervisedTask, + OpenMLTask, ) -from . import runs -from .runs import OpenMLRun -from . import flows -from .flows import OpenMLFlow -from . import study -from .study import OpenMLStudy, OpenMLBenchmarkSuite -from . import utils -from . import setups -from .setups import OpenMLSetup, OpenMLParameter - - -from .__version__ import __version__ # noqa: F401 -def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None, run_ids=None): +def populate_cache( + task_ids: list[int] | None = None, + dataset_ids: list[int | str] | None = None, + flow_ids: list[int] | None = None, + run_ids: list[int] | None = None, +) -> None: """ Populate a cache for offline and parallel usage of the OpenML connector. 
@@ -117,4 +123,5 @@ def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None, run_ids=None) ] # Load the scikit-learn extension by default -import openml.extensions.sklearn # noqa: F401 +# TODO(eddiebergman): Not sure why this is at the bottom of the file +import openml.extensions.sklearn # noqa: E402, F401 diff --git a/openml/__version__.py b/openml/__version__.py index d44a77ce2..d927c85ca 100644 --- a/openml/__version__.py +++ b/openml/__version__.py @@ -3,4 +3,6 @@ # License: BSD 3-Clause # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.14.1" +from __future__ import annotations + +__version__ = "0.14.2" diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 9ac49495d..9865c86df 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -1,34 +1,47 @@ # License: BSD 3-Clause +from __future__ import annotations -import time import hashlib import logging import math -import pathlib import random -import requests +import time import urllib.parse import xml -import xmltodict -from urllib3 import ProxyManager -from typing import Dict, Optional, Tuple, Union import zipfile +from pathlib import Path +from typing import Dict, Tuple, Union import minio +import requests +import requests.utils +import xmltodict +from urllib3 import ProxyManager from . import config from .exceptions import ( + OpenMLHashException, OpenMLServerError, OpenMLServerException, OpenMLServerNoResult, - OpenMLHashException, ) DATA_TYPE = Dict[str, Union[str, int]] FILE_ELEMENTS_TYPE = Dict[str, Union[str, Tuple[str, str]]] +DATABASE_CONNECTION_ERRCODE = 107 + + +def _robot_delay(n: int) -> float: + wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60 + variation = random.gauss(0, wait / 10) + return max(1.0, wait + variation) -def resolve_env_proxies(url: str) -> Optional[str]: +def _human_delay(n: int) -> float: + return max(1.0, n) + + +def resolve_env_proxies(url: str) -> str | None: """Attempt to find a suitable proxy for this url. 
Relies on ``requests`` internals to remain consistent. To disable this from the @@ -45,8 +58,7 @@ def resolve_env_proxies(url: str) -> Optional[str]: The proxy url if found, else None """ resolved_proxies = requests.utils.get_environ_proxies(url) - selected_proxy = requests.utils.select_proxy(url, resolved_proxies) - return selected_proxy + return requests.utils.select_proxy(url, resolved_proxies) # type: ignore def _create_url_from_endpoint(endpoint: str) -> str: @@ -60,8 +72,8 @@ def _create_url_from_endpoint(endpoint: str) -> str: def _perform_api_call( call: str, request_method: str, - data: Optional[DATA_TYPE] = None, - file_elements: Optional[FILE_ELEMENTS_TYPE] = None, + data: DATA_TYPE | None = None, + file_elements: FILE_ELEMENTS_TYPE | None = None, ) -> str: """ Perform an API call at the OpenML server. @@ -111,17 +123,17 @@ def _perform_api_call( def _download_minio_file( source: str, - destination: Union[str, pathlib.Path], - exists_ok: bool = True, - proxy: Optional[str] = "auto", + destination: str | Path, + exists_ok: bool = True, # noqa: FBT001, FBT002 + proxy: str | None = "auto", ) -> None: """Download file ``source`` from a MinIO Bucket and store it at ``destination``. Parameters ---------- - source : Union[str, pathlib.Path] + source : str URL to a file in a MinIO bucket. - destination : str + destination : str | Path Path to store the file to, if a directory is provided the original filename is used. exists_ok : bool, optional (default=True) If False, raise FileExists if a file already exists in ``destination``. @@ -130,13 +142,13 @@ def _download_minio_file( automatically find the proxy to use. Pass None or the environment variable ``no_proxy="*"`` to disable proxies. 
""" - destination = pathlib.Path(destination) + destination = Path(destination) parsed_url = urllib.parse.urlparse(source) # expect path format: /BUCKET/path/to/file.ext bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1) if destination.is_dir(): - destination = pathlib.Path(destination, object_name) + destination = Path(destination, object_name) if destination.is_file() and not exists_ok: raise FileExistsError(f"File already exists in {destination}.") @@ -158,53 +170,52 @@ def _download_minio_file( zip_ref.extractall(destination.parent) except minio.error.S3Error as e: - if e.message.startswith("Object does not exist"): + if e.message is not None and e.message.startswith("Object does not exist"): raise FileNotFoundError(f"Object at '{source}' does not exist.") from e # e.g. permission error, or a bucket does not exist (which is also interpreted as a # permission error on minio level). raise FileNotFoundError("Bucket does not exist or is private.") from e -def _download_minio_bucket( - source: str, - destination: Union[str, pathlib.Path], - exists_ok: bool = True, -) -> None: +def _download_minio_bucket(source: str, destination: str | Path) -> None: """Download file ``source`` from a MinIO Bucket and store it at ``destination``. Parameters ---------- - source : Union[str, pathlib.Path] + source : str URL to a MinIO bucket. - destination : str + destination : str | Path Path to a directory to store the bucket content in. exists_ok : bool, optional (default=True) If False, raise FileExists if a file already exists in ``destination``. 
""" - - destination = pathlib.Path(destination) + destination = Path(destination) parsed_url = urllib.parse.urlparse(source) # expect path format: /BUCKET/path/to/file.ext - bucket = parsed_url.path[1:] + _, bucket, *prefixes, _file = parsed_url.path.split("/") + prefix = "/".join(prefixes) client = minio.Minio(endpoint=parsed_url.netloc, secure=False) - for file_object in client.list_objects(bucket, recursive=True): + for file_object in client.list_objects(bucket, prefix=prefix, recursive=True): + if file_object.object_name is None: + raise ValueError("Object name is None.") + _download_minio_file( - source=source + "/" + file_object.object_name, - destination=pathlib.Path(destination, file_object.object_name), + source=source.rsplit("/", 1)[0] + "/" + file_object.object_name.rsplit("/", 1)[1], + destination=Path(destination, file_object.object_name.rsplit("/", 1)[1]), exists_ok=True, ) def _download_text_file( source: str, - output_path: Optional[str] = None, - md5_checksum: Optional[str] = None, - exists_ok: bool = True, + output_path: str | Path | None = None, + md5_checksum: str | None = None, + exists_ok: bool = True, # noqa: FBT001, FBT002 encoding: str = "utf8", -) -> Optional[str]: +) -> str | None: """Download the text file at `source` and store it in `output_path`. By default, do nothing if a file already exists in `output_path`. @@ -214,7 +225,7 @@ def _download_text_file( ---------- source : str url of the file to be downloaded - output_path : str, (optional) + output_path : str | Path | None (default=None) full path, including filename, of where the file should be stored. If ``None``, this function returns the downloaded file as string. md5_checksum : str, optional (default=None) @@ -224,15 +235,14 @@ def _download_text_file( encoding : str, optional (default='utf8') The encoding with which the file should be stored. 
""" - if output_path is not None: - try: - with open(output_path, encoding=encoding): - if exists_ok: - return None - else: - raise FileExistsError - except FileNotFoundError: - pass + if isinstance(output_path, str): + output_path = Path(output_path) + + if output_path is not None and output_path.exists(): + if not exists_ok: + raise FileExistsError + + return None logging.info("Starting [%s] request for the URL %s", "get", source) start = time.time() @@ -248,87 +258,93 @@ def _download_text_file( ) return downloaded_file - else: - with open(output_path, "w", encoding=encoding) as fh: - fh.write(downloaded_file) - - logging.info( - "%.7fs taken for [%s] request for the URL %s", - time.time() - start, - "get", - source, - ) + with output_path.open("w", encoding=encoding) as fh: + fh.write(downloaded_file) - del downloaded_file - return None + logging.info( + "%.7fs taken for [%s] request for the URL %s", + time.time() - start, + "get", + source, + ) + return None -def _file_id_to_url(file_id: str, filename: Optional[str] = None) -> str: +def _file_id_to_url(file_id: int, filename: str | None = None) -> str: """ Presents the URL how to download a given file id filename is optional """ openml_url = config.server.split("/api/") - url = openml_url[0] + "/data/download/%s" % file_id + url = openml_url[0] + f"/data/download/{file_id!s}" if filename is not None: url += "/" + filename return url def _read_url_files( - url: str, data: Optional[DATA_TYPE] = None, file_elements: Optional[FILE_ELEMENTS_TYPE] = None + url: str, + data: DATA_TYPE | None = None, + file_elements: FILE_ELEMENTS_TYPE | None = None, ) -> requests.Response: - """do a post request to url with data - and sending file_elements as files""" - + """Do a post request to url with data + and sending file_elements as files + """ data = {} if data is None else data data["api_key"] = config.apikey if file_elements is None: file_elements = {} # Using requests.post sets header 'Accept-encoding' automatically to # 
'gzip,deflate' - response = _send_request( + return _send_request( request_method="post", url=url, data=data, files=file_elements, ) - return response def __read_url( url: str, request_method: str, - data: Optional[DATA_TYPE] = None, - md5_checksum: Optional[str] = None, + data: DATA_TYPE | None = None, + md5_checksum: str | None = None, ) -> requests.Response: data = {} if data is None else data if config.apikey: data["api_key"] = config.apikey return _send_request( - request_method=request_method, url=url, data=data, md5_checksum=md5_checksum + request_method=request_method, + url=url, + data=data, + md5_checksum=md5_checksum, ) -def __is_checksum_equal(downloaded_file_binary: bytes, md5_checksum: Optional[str] = None) -> bool: +def __is_checksum_equal(downloaded_file_binary: bytes, md5_checksum: str | None = None) -> bool: if md5_checksum is None: return True - md5 = hashlib.md5() + md5 = hashlib.md5() # noqa: S324 md5.update(downloaded_file_binary) md5_checksum_download = md5.hexdigest() return md5_checksum == md5_checksum_download -def _send_request( +def _send_request( # noqa: C901 request_method: str, url: str, data: DATA_TYPE, - files: Optional[FILE_ELEMENTS_TYPE] = None, - md5_checksum: Optional[str] = None, + files: FILE_ELEMENTS_TYPE | None = None, + md5_checksum: str | None = None, ) -> requests.Response: n_retries = max(1, config.connection_n_retries) - response: requests.Response + response: requests.Response | None = None + delay_method = _human_delay if config.retry_policy == "human" else _robot_delay + + # Error to raise in case of retrying too often. Will be set to the last observed exception. 
+ retry_raise_e: Exception | None = None + with requests.Session() as session: # Start at one to have a non-zero multiplier for the sleep for retry_counter in range(1, n_retries + 1): @@ -341,7 +357,9 @@ def _send_request( response = session.post(url, data=data, files=files) else: raise NotImplementedError() + __check_response(response=response, url=url, file_elements=files) + if request_method == "get" and not __is_checksum_equal( response.text.encode("utf-8"), md5_checksum ): @@ -352,85 +370,86 @@ def _send_request( "because the text encoding is not UTF-8 when downloading {}. " "There might be a sever-sided issue with the file, " "see: https://github.com/openml/openml-python/issues/1180.".format( - md5_checksum, url - ) + md5_checksum, + url, + ), ) raise OpenMLHashException( "Checksum of downloaded file is unequal to the expected checksum {} " - "when downloading {}.".format(md5_checksum, url) + "when downloading {}.".format(md5_checksum, url), ) - break + + return response + except OpenMLServerException as e: + # Propagate all server errors to the calling functions, except + # for 107 which represents a database connection error. + # These are typically caused by high server load, + # which means trying again might resolve the issue. + if e.code != DATABASE_CONNECTION_ERRCODE: + raise e + retry_raise_e = e + except xml.parsers.expat.ExpatError as e: + if request_method != "get" or retry_counter >= n_retries: + if response is not None: + extra = f"Status code: {response.status_code}\n{response.text}" + else: + extra = "No response retrieved." + + raise OpenMLServerError( + f"Unexpected server error when calling {url}. 
Please contact the " + f"developers!\n{extra}" + ) from e + retry_raise_e = e except ( requests.exceptions.ChunkedEncodingError, requests.exceptions.ConnectionError, requests.exceptions.SSLError, - OpenMLServerException, - xml.parsers.expat.ExpatError, OpenMLHashException, ) as e: - if isinstance(e, OpenMLServerException) and e.code != 107: - # Propagate all server errors to the calling functions, except - # for 107 which represents a database connection error. - # These are typically caused by high server load, - # which means trying again might resolve the issue. - raise - elif isinstance(e, xml.parsers.expat.ExpatError): - if request_method != "get" or retry_counter >= n_retries: - raise OpenMLServerError( - "Unexpected server error when calling {}. Please contact the " - "developers!\nStatus code: {}\n{}".format( - url, - response.status_code, - response.text, - ) - ) - if retry_counter >= n_retries: - raise - else: + retry_raise_e = e - def robot(n: int) -> float: - wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60 - variation = random.gauss(0, wait / 10) - return max(1.0, wait + variation) + # We can only be here if there was an exception + assert retry_raise_e is not None + if retry_counter >= n_retries: + raise retry_raise_e + delay = delay_method(retry_counter) + time.sleep(delay) - def human(n: int) -> float: - return max(1.0, n) - - delay = {"human": human, "robot": robot}[config.retry_policy](retry_counter) - time.sleep(delay) + assert response is not None return response def __check_response( - response: requests.Response, url: str, file_elements: Optional[FILE_ELEMENTS_TYPE] + response: requests.Response, + url: str, + file_elements: FILE_ELEMENTS_TYPE | None, ) -> None: if response.status_code != 200: raise __parse_server_exception(response, url, file_elements=file_elements) - elif ( - "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip" - ): - logging.warning("Received uncompressed content from OpenML for 
{}.".format(url)) + if "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip": + logging.warning(f"Received uncompressed content from OpenML for {url}.") def __parse_server_exception( response: requests.Response, url: str, - file_elements: Optional[FILE_ELEMENTS_TYPE], + file_elements: FILE_ELEMENTS_TYPE | None, ) -> OpenMLServerError: if response.status_code == 414: - raise OpenMLServerError("URI too long! ({})".format(url)) + raise OpenMLServerError(f"URI too long! ({url})") + try: server_exception = xmltodict.parse(response.text) - except xml.parsers.expat.ExpatError: - raise - except Exception: + except xml.parsers.expat.ExpatError as e: + raise e + except Exception as e: # noqa: BLE001 # OpenML has a sophisticated error system # where information about failures is provided. try to parse this raise OpenMLServerError( - "Unexpected server error when calling {}. Please contact the developers!\n" - "Status code: {}\n{}".format(url, response.status_code, response.text) - ) + f"Unexpected server error when calling {url}. 
Please contact the developers!\n" + f"Status code: {response.status_code}\n{response.text}", + ) from e server_error = server_exception["oml:error"] code = int(server_error["oml:code"]) @@ -438,7 +457,7 @@ def __parse_server_exception( additional_information = server_error.get("oml:additional_information") if code in [372, 512, 500, 482, 542, 674]: if additional_information: - full_message = "{} - {}".format(message, additional_information) + full_message = f"{message} - {additional_information}" else: full_message = message @@ -457,5 +476,5 @@ def __parse_server_exception( additional_information, ) else: - full_message = "{} - {}".format(message, additional_information) + full_message = f"{message} - {additional_information}" return OpenMLServerException(code=code, message=full_message, url=url) diff --git a/openml/base.py b/openml/base.py index 35a9ce58f..37693a2ec 100644 --- a/openml/base.py +++ b/openml/base.py @@ -1,32 +1,33 @@ # License: BSD 3-Clause +from __future__ import annotations -from abc import ABC, abstractmethod -from collections import OrderedDict import re -from typing import Optional, List, Tuple, Union, Dict import webbrowser +from abc import ABC, abstractmethod +from typing import Iterable, Sequence import xmltodict +import openml._api_calls import openml.config -from .utils import _tag_openml_base, _get_rest_api_type_alias + +from .utils import _get_rest_api_type_alias, _tag_openml_base class OpenMLBase(ABC): """Base object for functionality that is shared across entities.""" - def __repr__(self): + def __repr__(self) -> str: body_fields = self._get_repr_body_fields() return self._apply_repr_template(body_fields) @property @abstractmethod - def id(self) -> Optional[int]: + def id(self) -> int | None: """The id of the entity, it is unique for its entity type.""" - pass @property - def openml_url(self) -> Optional[str]: + def openml_url(self) -> str | None: """The URL of the object on the server, if it was uploaded, else None.""" if self.id is 
None: return None @@ -36,7 +37,7 @@ def openml_url(self) -> Optional[str]: def url_for_id(cls, id_: int) -> str: """Return the OpenML URL for the object of the class entity with the given id.""" # Sample url for a flow: openml.org/f/123 - return "{}/{}/{}".format(openml.config.get_server_base_url(), cls._entity_letter(), id_) + return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}" @classmethod def _entity_letter(cls) -> str: @@ -45,21 +46,24 @@ def _entity_letter(cls) -> str: # which holds for all entities except studies and tasks, which overwrite this method. return cls.__name__.lower()[len("OpenML") :][0] + # TODO(eddiebergman): This would be much cleaner as an iterator... @abstractmethod - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: """Collect all information to display in the __repr__ body. Returns - ------ + ------- body_fields : List[Tuple[str, Union[str, int, List[str]]]] A list of (name, value) pairs to display in the body of the __repr__. E.g.: [('metric', 'accuracy'), ('dataset', 'iris')] If value is a List of str, then each item of the list will appear in a separate row. """ # Should be implemented in the base class. - pass - def _apply_repr_template(self, body_fields: List[Tuple[str, str]]) -> str: + def _apply_repr_template( + self, + body_fields: Iterable[tuple[str, str | int | list[str] | None]], + ) -> str: """Generates the header and formats the body for string representation of the object. Parameters @@ -69,33 +73,34 @@ def _apply_repr_template(self, body_fields: List[Tuple[str, str]]) -> str: """ # We add spaces between capitals, e.g. 
ClassificationTask -> Classification Task name_with_spaces = re.sub( - r"(\w)([A-Z])", r"\1 \2", self.__class__.__name__[len("OpenML") :] + r"(\w)([A-Z])", + r"\1 \2", + self.__class__.__name__[len("OpenML") :], ) - header_text = "OpenML {}".format(name_with_spaces) + header_text = f"OpenML {name_with_spaces}" header = "{}\n{}\n".format(header_text, "=" * len(header_text)) - longest_field_name_length = max(len(name) for name, value in body_fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = "\n".join(field_line_format.format(name, value) for name, value in body_fields) + _body_fields: list[tuple[str, str | int | list[str]]] = [ + (k, "None" if v is None else v) for k, v in body_fields + ] + longest_field_name_length = max(len(name) for name, _ in _body_fields) + field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" + body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields) return header + body @abstractmethod - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": + def _to_dict(self) -> dict[str, dict]: """Creates a dictionary representation of self. - Uses OrderedDict to ensure consistent ordering when converting to xml. - The return value (OrderedDict) will be used to create the upload xml file. + The return value will be used to create the upload xml file. The xml file must have the tags in exactly the order of the object's xsd. (see https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/). Returns ------- - OrderedDict - Flow represented as OrderedDict. - + Thing represented as dict. """ # Should be implemented in the base class. 
- pass def _to_xml(self) -> str: """Generate xml representation of self for upload to server.""" @@ -104,10 +109,10 @@ def _to_xml(self) -> str: # A task may not be uploaded with the xml encoding specification: # - encoding_specification, xml_body = xml_representation.split("\n", 1) - return xml_body + _encoding_specification, xml_body = xml_representation.split("\n", 1) + return str(xml_body) - def _get_file_elements(self) -> Dict: + def _get_file_elements(self) -> openml._api_calls.FILE_ELEMENTS_TYPE: """Get file_elements to upload to the server, called during Publish. Derived child classes should overwrite this method as necessary. @@ -116,30 +121,37 @@ def _get_file_elements(self) -> Dict: return {} @abstractmethod - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict[str, str]) -> None: """Parse the id from the xml_response and assign it to self.""" - pass - def publish(self) -> "OpenMLBase": + def publish(self) -> OpenMLBase: + """Publish the object on the OpenML server.""" file_elements = self._get_file_elements() if "description" not in file_elements: file_elements["description"] = self._to_xml() - call = "{}/".format(_get_rest_api_type_alias(self)) + call = f"{_get_rest_api_type_alias(self)}/" response_text = openml._api_calls._perform_api_call( - call, "post", file_elements=file_elements + call, + "post", + file_elements=file_elements, ) xml_response = xmltodict.parse(response_text) self._parse_publish_response(xml_response) return self - def open_in_browser(self): + def open_in_browser(self) -> None: """Opens the OpenML web page corresponding to this object in your default browser.""" + if self.openml_url is None: + raise ValueError( + "Cannot open element on OpenML.org when attribute `openml_url` is `None`", + ) + webbrowser.open(self.openml_url) - def push_tag(self, tag: str): + def push_tag(self, tag: str) -> None: """Annotates this entity with a tag on the server. 
Parameters @@ -149,7 +161,7 @@ def push_tag(self, tag: str): """ _tag_openml_base(self, tag) - def remove_tag(self, tag: str): + def remove_tag(self, tag: str) -> None: """Removes a tag from this entity on the server. Parameters diff --git a/openml/cli.py b/openml/cli.py index 039ac227c..5732442d0 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -1,13 +1,13 @@ -"""" Command Line Interface for `openml` to configure its settings. """ +""""Command Line Interface for `openml` to configure its settings.""" +from __future__ import annotations import argparse -import os -import pathlib import string -from typing import Union, Callable +import sys +from pathlib import Path +from typing import Callable from urllib.parse import urlparse - from openml import config @@ -19,12 +19,14 @@ def looks_like_url(url: str) -> bool: # There's no thorough url parser, but we only seem to use netloc. try: return bool(urlparse(url).netloc) - except Exception: + except Exception: # noqa: BLE001 return False def wait_until_valid_input( - prompt: str, check: Callable[[str], str], sanitize: Union[Callable[[str], str], None] + prompt: str, + check: Callable[[str], str], + sanitize: Callable[[str], str] | None, ) -> str: """Asks `prompt` until an input is received which returns True for `check`. 
@@ -43,7 +45,6 @@ def wait_until_valid_input( valid input """ - while True: response = input(prompt) if sanitize: @@ -55,7 +56,7 @@ def wait_until_valid_input( return response -def print_configuration(): +def print_configuration() -> None: file = config.determine_config_file_path() header = f"File '{file}' contains (or defaults to):" print(header) @@ -65,7 +66,7 @@ def print_configuration(): print(f"{field.ljust(max_key_length)}: {value}") -def verbose_set(field, value): +def verbose_set(field: str, value: str) -> None: config.set_field_in_config_file(field, value) print(f"{field} set to '{value}'.") @@ -123,17 +124,20 @@ def replace_shorthand(server: str) -> str: def configure_cachedir(value: str) -> None: def check_cache_dir(path: str) -> str: - p = pathlib.Path(path) - if p.is_file(): - return f"'{path}' is a file, not a directory." - expanded = p.expanduser() + _path = Path(path) + if _path.is_file(): + return f"'{_path}' is a file, not a directory." + + expanded = _path.expanduser() if not expanded.is_absolute(): - return f"'{path}' is not absolute (even after expanding '~')." + return f"'{_path}' is not absolute (even after expanding '~')." + if not expanded.exists(): try: - os.mkdir(expanded) + expanded.mkdir() except PermissionError: return f"'{path}' does not exist and there are not enough permissions to create it." + return "" configure_field( @@ -143,7 +147,6 @@ def check_cache_dir(path: str) -> str: intro_message="Configuring the cache directory. 
It can not be a relative path.", input_message="Specify the directory to use (or create) as cache directory: ", ) - print("NOTE: Data from your old cache directory is not moved over.") def configure_connection_n_retries(value: str) -> None: @@ -244,13 +247,13 @@ def autocomplete_policy(policy: str) -> str: ) -def configure_field( +def configure_field( # noqa: PLR0913 field: str, - value: Union[None, str], + value: None | str, check_with_message: Callable[[str], str], intro_message: str, input_message: str, - sanitize: Union[Callable[[str], str], None] = None, + sanitize: Callable[[str], str] | None = None, ) -> None: """Configure `field` with `value`. If `value` is None ask the user for input. @@ -284,7 +287,7 @@ def configure_field( malformed_input = check_with_message(value) if malformed_input: print(malformed_input) - quit() + sys.exit() else: print(intro_message) value = wait_until_valid_input( @@ -295,7 +298,7 @@ def configure_field( verbose_set(field, value) -def configure(args: argparse.Namespace): +def configure(args: argparse.Namespace) -> None: """Calls the right submenu(s) to edit `args.field` in the configuration file.""" set_functions = { "apikey": configure_apikey, @@ -307,7 +310,7 @@ def configure(args: argparse.Namespace): "verbosity": configure_verbosity, } - def not_supported_yet(_): + def not_supported_yet(_: str) -> None: print(f"Setting '{args.field}' is not supported yet.") if args.field not in ["all", "none"]: @@ -315,12 +318,11 @@ def not_supported_yet(_): else: if args.value is not None: print(f"Can not set value ('{args.value}') when field is specified as '{args.field}'.") - quit() + sys.exit() print_configuration() if args.field == "all": for set_field_function in set_functions.values(): - print() # Visually separating the output by field. 
set_field_function(args.value) diff --git a/openml/config.py b/openml/config.py index b68455a9b..4744dbe86 100644 --- a/openml/config.py +++ b/openml/config.py @@ -1,30 +1,38 @@ -""" -Store module level information like the API key, cache directory and the server -""" +"""Store module level information like the API key, cache directory and the server""" # License: BSD 3-Clause +from __future__ import annotations +import configparser import logging import logging.handlers import os -from pathlib import Path import platform -from typing import Tuple, cast, Any, Optional import warnings - from io import StringIO -import configparser +from pathlib import Path +from typing import Any, cast +from typing_extensions import Literal, TypedDict from urllib.parse import urlparse logger = logging.getLogger(__name__) openml_logger = logging.getLogger("openml") -console_handler = None -file_handler = None +console_handler: logging.StreamHandler | None = None +file_handler: logging.handlers.RotatingFileHandler | None = None + +class _Config(TypedDict): + apikey: str + server: str + cachedir: Path + avoid_duplicate_runs: bool + retry_policy: Literal["human", "robot"] + connection_n_retries: int -def _create_log_handlers(create_file_handler=True): + +def _create_log_handlers(create_file_handler: bool = True) -> None: # noqa: FBT001, FBT002 """Creates but does not attach the log handlers.""" - global console_handler, file_handler + global console_handler, file_handler # noqa: PLW0603 if console_handler is not None or file_handler is not None: logger.debug("Requested to create log handlers, but they are already created.") return @@ -37,14 +45,17 @@ def _create_log_handlers(create_file_handler=True): if create_file_handler: one_mb = 2**20 - log_path = os.path.join(_root_cache_directory, "openml_python.log") + log_path = _root_cache_directory / "openml_python.log" file_handler = logging.handlers.RotatingFileHandler( - log_path, maxBytes=one_mb, backupCount=1, delay=True + log_path, + 
maxBytes=one_mb, + backupCount=1, + delay=True, ) file_handler.setFormatter(output_formatter) -def _convert_log_levels(log_level: int) -> Tuple[int, int]: +def _convert_log_levels(log_level: int) -> tuple[int, int]: """Converts a log level that's either defined by OpenML/Python to both specifications.""" # OpenML verbosity level don't match Python values directly: openml_to_python = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG} @@ -61,9 +72,9 @@ def _convert_log_levels(log_level: int) -> Tuple[int, int]: return openml_level, python_level -def _set_level_register_and_store(handler: logging.Handler, log_level: int): +def _set_level_register_and_store(handler: logging.Handler, log_level: int) -> None: """Set handler log level, register it if needed, save setting to config file if specified.""" - oml_level, py_level = _convert_log_levels(log_level) + _oml_level, py_level = _convert_log_levels(log_level) handler.setLevel(py_level) if openml_logger.level > py_level or openml_logger.level == logging.NOTSET: @@ -73,42 +84,38 @@ def _set_level_register_and_store(handler: logging.Handler, log_level: int): openml_logger.addHandler(handler) -def set_console_log_level(console_output_level: int): +def set_console_log_level(console_output_level: int) -> None: """Set console output to the desired level and register it with openml logger if needed.""" - global console_handler - _set_level_register_and_store(cast(logging.Handler, console_handler), console_output_level) + global console_handler # noqa: PLW0602 + assert console_handler is not None + _set_level_register_and_store(console_handler, console_output_level) -def set_file_log_level(file_output_level: int): +def set_file_log_level(file_output_level: int) -> None: """Set file output to the desired level and register it with openml logger if needed.""" - global file_handler - _set_level_register_and_store(cast(logging.Handler, file_handler), file_output_level) + global file_handler # noqa: PLW0602 + assert file_handler 
is not None + _set_level_register_and_store(file_handler, file_output_level) # Default values (see also https://github.com/openml/OpenML/wiki/Client-API-Standards) -_defaults = { +_user_path = Path("~").expanduser().absolute() +_defaults: _Config = { "apikey": "", "server": "https://www.openml.org/api/v1/xml", "cachedir": ( - os.environ.get( - "XDG_CACHE_HOME", - os.path.join( - "~", - ".cache", - "openml", - ), - ) + Path(os.environ.get("XDG_CACHE_HOME", _user_path / ".cache" / "openml")) if platform.system() == "Linux" - else os.path.join("~", ".openml") + else _user_path / ".openml" ), - "avoid_duplicate_runs": "True", + "avoid_duplicate_runs": True, "retry_policy": "human", - "connection_n_retries": "5", + "connection_n_retries": 5, } # Default values are actually added here in the _setup() function which is # called at the end of this module -server = str(_defaults["server"]) # so mypy knows it is a string +server = _defaults["server"] def get_server_base_url() -> str: @@ -117,32 +124,34 @@ def get_server_base_url() -> str: Turns ``"https://www.openml.org/api/v1/xml"`` in ``"https://www.openml.org/"`` Returns - ======= + ------- str """ return server.split("/api")[0] -apikey = _defaults["apikey"] +apikey: str = _defaults["apikey"] # The current cache directory (without the server name) -_root_cache_directory = str(_defaults["cachedir"]) # so mypy knows it is a string -avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False +_root_cache_directory = Path(_defaults["cachedir"]) +avoid_duplicate_runs = _defaults["avoid_duplicate_runs"] retry_policy = _defaults["retry_policy"] -connection_n_retries = int(_defaults["connection_n_retries"]) +connection_n_retries = _defaults["connection_n_retries"] -def set_retry_policy(value: str, n_retries: Optional[int] = None) -> None: - global retry_policy - global connection_n_retries - default_retries_by_policy = dict(human=5, robot=50) +def set_retry_policy(value: Literal["human", "robot"], 
n_retries: int | None = None) -> None: + global retry_policy # noqa: PLW0603 + global connection_n_retries # noqa: PLW0603 + default_retries_by_policy = {"human": 5, "robot": 50} if value not in default_retries_by_policy: raise ValueError( - f"Detected retry_policy '{value}' but must be one of {default_retries_by_policy}" + f"Detected retry_policy '{value}' but must be one of " + f"{list(default_retries_by_policy.keys())}", ) if n_retries is not None and not isinstance(n_retries, int): raise TypeError(f"`n_retries` must be of type `int` or `None` but is `{type(n_retries)}`.") + if isinstance(n_retries, int) and n_retries < 1: raise ValueError(f"`n_retries` is '{n_retries}' but must be positive.") @@ -160,14 +169,14 @@ class ConfigurationForExamples: _test_apikey = "c0c42819af31e706efe1f4b88c23c6c1" @classmethod - def start_using_configuration_for_example(cls): + def start_using_configuration_for_example(cls) -> None: """Sets the configuration to connect to the test server with valid apikey. To configuration as was before this call is stored, and can be recovered by using the `stop_use_example_configuration` method. """ - global server - global apikey + global server # noqa: PLW0603 + global apikey # noqa: PLW0603 if cls._start_last_called and server == cls._test_server and apikey == cls._test_apikey: # Method is called more than once in a row without modifying the server or apikey. @@ -182,26 +191,27 @@ def start_using_configuration_for_example(cls): server = cls._test_server apikey = cls._test_apikey warnings.warn( - "Switching to the test server {} to not upload results to the live server. " - "Using the test server may result in reduced performance of the API!".format(server) + f"Switching to the test server {server} to not upload results to the live server. 
" + "Using the test server may result in reduced performance of the API!", + stacklevel=2, ) @classmethod - def stop_using_configuration_for_example(cls): + def stop_using_configuration_for_example(cls) -> None: """Return to configuration as it was before `start_use_example_configuration`.""" if not cls._start_last_called: # We don't want to allow this because it will (likely) result in the `server` and # `apikey` variables being set to None. raise RuntimeError( "`stop_use_example_configuration` called without a saved config." - "`start_use_example_configuration` must be called first." + "`start_use_example_configuration` must be called first.", ) - global server - global apikey + global server # noqa: PLW0603 + global apikey # noqa: PLW0603 - server = cls._last_used_server - apikey = cls._last_used_key + server = cast(str, cls._last_used_server) + apikey = cast(str, cls._last_used_key) cls._start_last_called = False @@ -211,11 +221,11 @@ def determine_config_file_path() -> Path: else: config_dir = Path("~") / ".openml" # Still use os.path.expanduser to trigger the mock in the unit test - config_dir = Path(os.path.expanduser(config_dir)) + config_dir = Path(config_dir).expanduser().resolve() return config_dir / "config" -def _setup(config=None): +def _setup(config: _Config | None = None) -> None: """Setup openml package. Called on first import. Reads the config file and sets up apikey, server, cache appropriately. @@ -224,58 +234,48 @@ def _setup(config=None): openml.config.server = SOMESERVER We could also make it a property but that's less clear. 
""" - global apikey - global server - global _root_cache_directory - global avoid_duplicate_runs + global apikey # noqa: PLW0603 + global server # noqa: PLW0603 + global _root_cache_directory # noqa: PLW0603 + global avoid_duplicate_runs # noqa: PLW0603 config_file = determine_config_file_path() config_dir = config_file.parent # read config file, create directory for config file - if not os.path.exists(config_dir): - try: - os.makedirs(config_dir, exist_ok=True) - cache_exists = True - except PermissionError: - cache_exists = False - else: - cache_exists = True + try: + if not config_dir.exists(): + config_dir.mkdir(exist_ok=True, parents=True) + except PermissionError: + pass if config is None: config = _parse_config(config_file) - def _get(config, key): - return config.get("FAKE_SECTION", key) + avoid_duplicate_runs = config.get("avoid_duplicate_runs", False) + apikey = config["apikey"] + server = config["server"] + short_cache_dir = config["cachedir"] + n_retries = config["connection_n_retries"] - avoid_duplicate_runs = config.getboolean("FAKE_SECTION", "avoid_duplicate_runs") - else: - - def _get(config, key): - return config.get(key) - - avoid_duplicate_runs = config.get("avoid_duplicate_runs") + set_retry_policy(config["retry_policy"], n_retries) - apikey = _get(config, "apikey") - server = _get(config, "server") - short_cache_dir = _get(config, "cachedir") + _root_cache_directory = short_cache_dir.expanduser().resolve() - n_retries = _get(config, "connection_n_retries") - if n_retries is not None: - n_retries = int(n_retries) - - set_retry_policy(_get(config, "retry_policy"), n_retries) + try: + cache_exists = _root_cache_directory.exists() + except PermissionError: + cache_exists = False - _root_cache_directory = os.path.expanduser(short_cache_dir) # create the cache subdirectory - if not os.path.exists(_root_cache_directory): - try: - os.makedirs(_root_cache_directory, exist_ok=True) - except PermissionError: - openml_logger.warning( - "No permission to 
create openml cache directory at %s! This can result in " - "OpenML-Python not working properly." % _root_cache_directory - ) + try: + if not _root_cache_directory.exists(): + _root_cache_directory.mkdir(exist_ok=True, parents=True) + except PermissionError: + openml_logger.warning( + "No permission to create openml cache directory at %s! This can result in " + "OpenML-Python not working properly." % _root_cache_directory, + ) if cache_exists: _create_log_handlers() @@ -283,41 +283,43 @@ def _get(config, key): _create_log_handlers(create_file_handler=False) openml_logger.warning( "No permission to create OpenML directory at %s! This can result in OpenML-Python " - "not working properly." % config_dir + "not working properly." % config_dir, ) -def set_field_in_config_file(field: str, value: Any): +def set_field_in_config_file(field: str, value: Any) -> None: """Overwrites the `field` in the configuration file with the new `value`.""" if field not in _defaults: - return ValueError(f"Field '{field}' is not valid and must be one of '{_defaults.keys()}'.") + raise ValueError(f"Field '{field}' is not valid and must be one of '{_defaults.keys()}'.") + # TODO(eddiebergman): This use of globals has gone too far globals()[field] = value config_file = determine_config_file_path() - config = _parse_config(str(config_file)) - with open(config_file, "w") as fh: - for f in _defaults.keys(): + config = _parse_config(config_file) + with config_file.open("w") as fh: + for f in _defaults: # We can't blindly set all values based on globals() because when the user # sets it through config.FIELD it should not be stored to file. # There doesn't seem to be a way to avoid writing defaults to file with configparser, # because it is impossible to distinguish from an explicitly set value that matches # the default value, to one that was set to its default because it was omitted. 
- value = config.get("FAKE_SECTION", f) + value = config.get("FAKE_SECTION", f) # type: ignore if f == field: value = globals()[f] fh.write(f"{f} = {value}\n") -def _parse_config(config_file: str): +def _parse_config(config_file: str | Path) -> _Config: """Parse the config file, set up defaults.""" - config = configparser.RawConfigParser(defaults=_defaults) + config_file = Path(config_file) + config = configparser.RawConfigParser(defaults=_defaults) # type: ignore # The ConfigParser requires a [SECTION_HEADER], which we do not expect in our config file. # Cheat the ConfigParser module by adding a fake section header config_file_ = StringIO() config_file_.write("[FAKE_SECTION]\n") try: - with open(config_file) as fh: + with config_file.open() as fh: for line in fh: config_file_.write(line) except FileNotFoundError: @@ -326,21 +328,22 @@ def _parse_config(config_file: str): logger.info("Error opening file %s: %s", config_file, e.args[0]) config_file_.seek(0) config.read_file(config_file_) - return config + return dict(config.items("FAKE_SECTION")) # type: ignore -def get_config_as_dict(): - config = dict() - config["apikey"] = apikey - config["server"] = server - config["cachedir"] = _root_cache_directory - config["avoid_duplicate_runs"] = avoid_duplicate_runs - config["connection_n_retries"] = connection_n_retries - config["retry_policy"] = retry_policy - return config +def get_config_as_dict() -> _Config: + return { + "apikey": apikey, + "server": server, + "cachedir": _root_cache_directory, + "avoid_duplicate_runs": avoid_duplicate_runs, + "connection_n_retries": connection_n_retries, + "retry_policy": retry_policy, + } -def get_cache_directory(): +# NOTE: For backwards compatibility, we keep the `str` +def get_cache_directory() -> str: """Get the current cache directory. 
This gets the cache directory for the current server relative @@ -361,12 +364,11 @@ def get_cache_directory(): """ url_suffix = urlparse(server).netloc - reversed_url_suffix = os.sep.join(url_suffix.split(".")[::-1]) - _cachedir = os.path.join(_root_cache_directory, reversed_url_suffix) - return _cachedir + reversed_url_suffix = os.sep.join(url_suffix.split(".")[::-1]) # noqa: PTH118 + return os.path.join(_root_cache_directory, reversed_url_suffix) # noqa: PTH118 -def set_root_cache_directory(root_cache_directory): +def set_root_cache_directory(root_cache_directory: str | Path) -> None: """Set module-wide base cache directory. Sets the root cache directory, wherin the cache directories are @@ -381,13 +383,12 @@ def set_root_cache_directory(root_cache_directory): root_cache_directory : string Path to use as cache directory. - See also + See Also -------- get_cache_directory """ - - global _root_cache_directory - _root_cache_directory = root_cache_directory + global _root_cache_directory # noqa: PLW0603 + _root_cache_directory = Path(root_cache_directory) start_using_configuration_for_example = ( diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py index efa5a5d5b..480dd9576 100644 --- a/openml/datasets/__init__.py +++ b/openml/datasets/__init__.py @@ -1,20 +1,20 @@ # License: BSD 3-Clause +from .data_feature import OpenMLDataFeature +from .dataset import OpenMLDataset from .functions import ( attributes_arff_from_df, check_datasets_active, create_dataset, + delete_dataset, + edit_dataset, + fork_dataset, get_dataset, get_datasets, list_datasets, - status_update, list_qualities, - edit_dataset, - fork_dataset, - delete_dataset, + status_update, ) -from .dataset import OpenMLDataset -from .data_feature import OpenMLDataFeature __all__ = [ "attributes_arff_from_df", diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py index b4550b5d7..218b0066d 100644 --- a/openml/datasets/data_feature.py +++ b/openml/datasets/data_feature.py 
@@ -1,9 +1,13 @@ # License: BSD 3-Clause +from __future__ import annotations -from typing import List +from typing import TYPE_CHECKING, Any, ClassVar, Sequence +if TYPE_CHECKING: + from IPython.lib import pretty -class OpenMLDataFeature(object): + +class OpenMLDataFeature: """ Data Feature (a.k.a. Attribute) object. @@ -18,52 +22,63 @@ class OpenMLDataFeature(object): nominal_values : list(str) list of the possible values, in case of nominal attribute number_missing_values : int + Number of rows that have a missing value for this feature. + ontologies : list(str) + list of ontologies attached to this feature. An ontology describes the + concept that are described in a feature. An ontology is defined by an + URL where the information is provided. """ - LEGAL_DATA_TYPES = ["nominal", "numeric", "string", "date"] + LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"] - def __init__( + def __init__( # noqa: PLR0913 self, index: int, name: str, data_type: str, - nominal_values: List[str], + nominal_values: list[str], number_missing_values: int, + ontologies: list[str] | None = None, ): - if type(index) != int: - raise ValueError("Index is of wrong datatype") + if not isinstance(index, int): + raise TypeError(f"Index must be `int` but is {type(index)}") + if data_type not in self.LEGAL_DATA_TYPES: raise ValueError( - "data type should be in %s, found: %s" % (str(self.LEGAL_DATA_TYPES), data_type) + f"data type should be in {self.LEGAL_DATA_TYPES!s}, found: {data_type}", ) + if data_type == "nominal": if nominal_values is None: raise TypeError( "Dataset features require attribute `nominal_values` for nominal " - "feature type." 
+ "feature type.", ) - elif not isinstance(nominal_values, list): + + if not isinstance(nominal_values, list): raise TypeError( "Argument `nominal_values` is of wrong datatype, should be list, " - "but is {}".format(type(nominal_values)) + f"but is {type(nominal_values)}", ) - else: - if nominal_values is not None: - raise TypeError("Argument `nominal_values` must be None for non-nominal feature.") - if type(number_missing_values) != int: - raise ValueError("number_missing_values is of wrong datatype") + elif nominal_values is not None: + raise TypeError("Argument `nominal_values` must be None for non-nominal feature.") + + if not isinstance(number_missing_values, int): + msg = f"number_missing_values must be int but is {type(number_missing_values)}" + raise TypeError(msg) self.index = index self.name = str(name) self.data_type = str(data_type) self.nominal_values = nominal_values self.number_missing_values = number_missing_values + self.ontologies = ontologies - def __repr__(self): + def __repr__(self) -> str: return "[%d - %s (%s)]" % (self.index, self.name, self.data_type) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: return isinstance(other, OpenMLDataFeature) and self.__dict__ == other.__dict__ - def _repr_pretty_(self, pp, cycle): + def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None: # noqa: FBT001, ARG002 pp.text(str(self)) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index dcdef162d..0c9da1caf 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -1,13 +1,14 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import OrderedDict -import re import gzip import logging -import os import pickle -from typing import List, Optional, Union, Tuple, Iterable, Dict +import re import warnings +from pathlib import Path +from typing import Any, Iterable, Sequence +from typing_extensions import Literal import arff import numpy as np @@ -16,8 +17,9 @@ import 
xmltodict from openml.base import OpenMLBase +from openml.exceptions import PyOpenMLError + from .data_feature import OpenMLDataFeature -from ..exceptions import PyOpenMLError logger = logging.getLogger(__name__) @@ -88,66 +90,70 @@ class OpenMLDataset(OpenMLBase): MD5 checksum to check if the dataset is downloaded without corruption. data_file : str, optional Path to where the dataset is located. - features : dict, optional + features_file : dict, optional A dictionary of dataset features, which maps a feature index to a OpenMLDataFeature. - qualities : dict, optional + qualities_file : dict, optional A dictionary of dataset qualities, which maps a quality name to a quality value. dataset: string, optional Serialized arff dataset string. - minio_url: string, optional - URL to the MinIO bucket with dataset files + parquet_url: string, optional + This is the URL to the storage location where the dataset files are hosted. + This can be a MinIO bucket URL. If specified, the data will be accessed + from this URL when reading the files. parquet_file: string, optional - Path to the local parquet file. + Path to the local file. 
""" - def __init__( + def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 self, - name, - description, - data_format="arff", - cache_format="pickle", - dataset_id=None, - version=None, - creator=None, - contributor=None, - collection_date=None, - upload_date=None, - language=None, - licence=None, - url=None, - default_target_attribute=None, - row_id_attribute=None, - ignore_attribute=None, - version_label=None, - citation=None, - tag=None, - visibility=None, - original_data_url=None, - paper_url=None, - update_comment=None, - md5_checksum=None, - data_file=None, - features_file: Optional[str] = None, - qualities_file: Optional[str] = None, - dataset=None, - minio_url: Optional[str] = None, - parquet_file: Optional[str] = None, + name: str, + description: str | None, + data_format: Literal["arff", "sparse_arff"] = "arff", + cache_format: Literal["feather", "pickle"] = "pickle", + dataset_id: int | None = None, + version: int | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + upload_date: str | None = None, + language: str | None = None, + licence: str | None = None, + url: str | None = None, + default_target_attribute: str | None = None, + row_id_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, + version_label: str | None = None, + citation: str | None = None, + tag: str | None = None, + visibility: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + update_comment: str | None = None, + md5_checksum: str | None = None, + data_file: str | None = None, + features_file: str | None = None, + qualities_file: str | None = None, + dataset: str | None = None, + parquet_url: str | None = None, + parquet_file: str | None = None, ): - def find_invalid_characters(string, pattern): + if cache_format not in ["feather", "pickle"]: + raise ValueError( + "cache_format must be one of 'feather' or 'pickle. 
" + f"Invalid format specified: {cache_format}", + ) + + def find_invalid_characters(string: str, pattern: str) -> str: invalid_chars = set() regex = re.compile(pattern) for char in string: if not regex.match(char): invalid_chars.add(char) - invalid_chars = ",".join( - [ - "'{}'".format(char) if char != "'" else '"{}"'.format(char) - for char in invalid_chars - ] + return ",".join( + [f"'{char}'" if char != "'" else f'"{char}"' for char in invalid_chars], ) - return invalid_chars if dataset_id is None: pattern = "^[\x00-\x7F]*$" @@ -155,32 +161,35 @@ def find_invalid_characters(string, pattern): # not basiclatin (XSD complains) invalid_characters = find_invalid_characters(description, pattern) raise ValueError( - "Invalid symbols {} in description: {}".format(invalid_characters, description) + f"Invalid symbols {invalid_characters} in description: {description}", ) pattern = "^[\x00-\x7F]*$" if citation and not re.match(pattern, citation): # not basiclatin (XSD complains) invalid_characters = find_invalid_characters(citation, pattern) raise ValueError( - "Invalid symbols {} in citation: {}".format(invalid_characters, citation) + f"Invalid symbols {invalid_characters} in citation: {citation}", ) pattern = "^[a-zA-Z0-9_\\-\\.\\(\\),]+$" if not re.match(pattern, name): # regex given by server in error message invalid_characters = find_invalid_characters(name, pattern) - raise ValueError("Invalid symbols {} in name: {}".format(invalid_characters, name)) + raise ValueError(f"Invalid symbols {invalid_characters} in name: {name}") + + self.ignore_attribute: list[str] | None = None + if isinstance(ignore_attribute, str): + self.ignore_attribute = [ignore_attribute] + elif isinstance(ignore_attribute, list) or ignore_attribute is None: + self.ignore_attribute = ignore_attribute + else: + raise ValueError("Wrong data type for ignore_attribute. 
Should be list.") + # TODO add function to check if the name is casual_string128 # Attributes received by querying the RESTful API self.dataset_id = int(dataset_id) if dataset_id is not None else None self.name = name self.version = int(version) if version is not None else None self.description = description - if cache_format not in ["feather", "pickle"]: - raise ValueError( - "cache_format must be one of 'feather' or 'pickle. " - "Invalid format specified: {}".format(cache_format) - ) - self.cache_format = cache_format # Has to be called format, otherwise there will be an XML upload error self.format = data_format @@ -193,12 +202,7 @@ def find_invalid_characters(string, pattern): self.url = url self.default_target_attribute = default_target_attribute self.row_id_attribute = row_id_attribute - if isinstance(ignore_attribute, str): - self.ignore_attribute = [ignore_attribute] # type: Optional[List[str]] - elif isinstance(ignore_attribute, list) or ignore_attribute is None: - self.ignore_attribute = ignore_attribute - else: - raise ValueError("Wrong data type for ignore_attribute. 
" "Should be list.") + self.version_label = version_label self.citation = citation self.tag = tag @@ -210,14 +214,14 @@ def find_invalid_characters(string, pattern): self.data_file = data_file self.parquet_file = parquet_file self._dataset = dataset - self._minio_url = minio_url + self._parquet_url = parquet_url - self._features = None # type: Optional[Dict[int, OpenMLDataFeature]] - self._qualities = None # type: Optional[Dict[str, float]] + self._features: dict[int, OpenMLDataFeature] | None = None + self._qualities: dict[str, float] | None = None self._no_qualities_found = False if features_file is not None: - self._features = _read_features(features_file) + self._features = _read_features(Path(features_file)) # "" was the old default value by `get_dataset` and maybe still used by some if qualities_file == "": @@ -227,30 +231,40 @@ def find_invalid_characters(string, pattern): "to avoid reading the qualities from file. Set `qualities_file` to None to avoid " "this warning.", FutureWarning, + stacklevel=2, ) + qualities_file = None - if qualities_file: - self._qualities = _read_qualities(qualities_file) + if qualities_file is not None: + self._qualities = _read_qualities(Path(qualities_file)) if data_file is not None: - rval = self._compressed_cache_file_paths(data_file) - self.data_pickle_file = rval[0] if os.path.exists(rval[0]) else None - self.data_feather_file = rval[1] if os.path.exists(rval[1]) else None - self.feather_attribute_file = rval[2] if os.path.exists(rval[2]) else None + data_pickle, data_feather, feather_attribute = self._compressed_cache_file_paths( + Path(data_file) + ) + self.data_pickle_file = data_pickle if Path(data_pickle).exists() else None + self.data_feather_file = data_feather if Path(data_feather).exists() else None + self.feather_attribute_file = feather_attribute if Path(feather_attribute) else None else: self.data_pickle_file = None self.data_feather_file = None self.feather_attribute_file = None @property - def features(self): + 
def features(self) -> dict[int, OpenMLDataFeature]: + """Get the features of this dataset.""" if self._features is None: + # TODO(eddiebergman): These should return a value so we can set it to be not None self._load_features() + assert self._features is not None return self._features @property - def qualities(self): + def qualities(self) -> dict[str, float] | None: + """Get the qualities of this dataset.""" + # TODO(eddiebergman): Better docstring, I don't know what qualities means + # We have to check `_no_qualities_found` as there might not be qualities for a dataset if self._qualities is None and (not self._no_qualities_found): self._load_qualities() @@ -258,26 +272,29 @@ def qualities(self): return self._qualities @property - def id(self) -> Optional[int]: + def id(self) -> int | None: + """Get the dataset numeric id.""" return self.dataset_id - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]: """Collect all information to display in the __repr__ body.""" - # Obtain number of features in accordance with lazy loading. 
+ n_features: int | None = None if self._qualities is not None and self._qualities["NumberOfFeatures"] is not None: - n_features = int(self._qualities["NumberOfFeatures"]) # type: Optional[int] - else: - n_features = len(self._features) if self._features is not None else None + n_features = int(self._qualities["NumberOfFeatures"]) + elif self._features is not None: + n_features = len(self._features) - fields = { + fields: dict[str, int | str | None] = { "Name": self.name, "Version": self.version, "Format": self.format, "Licence": self.licence, "Download URL": self.url, - "Data file": self.data_file, - "Pickle file": self.data_pickle_file, + "Data file": str(self.data_file) if self.data_file is not None else None, + "Pickle file": ( + str(self.data_pickle_file) if self.data_pickle_file is not None else None + ), "# of features": n_features, } if self.upload_date is not None: @@ -303,7 +320,7 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: ] return [(key, fields[key]) for key in order if key in fields] - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if not isinstance(other, OpenMLDataset): return False @@ -328,11 +345,11 @@ def _download_data(self) -> None: # import required here to avoid circular import. from .functions import _get_dataset_arff, _get_dataset_parquet - self.data_file = _get_dataset_arff(self) - if self._minio_url is not None: - self.parquet_file = _get_dataset_parquet(self) + self.data_file = str(_get_dataset_arff(self)) + if self._parquet_url is not None: + self.parquet_file = str(_get_dataset_parquet(self)) - def _get_arff(self, format: str) -> Dict: + def _get_arff(self, format: str) -> dict: # noqa: A002 """Read ARFF file and return decoded arff. Reads the file referenced in self.data_file. @@ -352,44 +369,49 @@ def _get_arff(self, format: str) -> Dict: Decoded arff. """ - # TODO: add a partial read method which only returns the attribute # headers of the corresponding .arff file! 
import struct filename = self.data_file + assert filename is not None + filepath = Path(filename) + bits = 8 * struct.calcsize("P") + # Files can be considered too large on a 32-bit system, # if it exceeds 120mb (slightly more than covtype dataset size) # This number is somewhat arbitrary. - if bits != 64 and os.path.getsize(filename) > 120000000: - raise NotImplementedError( - "File {} too big for {}-bit system ({} bytes).".format( - filename, os.path.getsize(filename), bits + if bits != 64: + MB_120 = 120_000_000 + file_size = filepath.stat().st_size + if file_size > MB_120: + raise NotImplementedError( + f"File {filepath} too big for {bits}-bit system ({file_size} bytes).", ) - ) if format.lower() == "arff": return_type = arff.DENSE elif format.lower() == "sparse_arff": return_type = arff.COO else: - raise ValueError("Unknown data format {}".format(format)) + raise ValueError(f"Unknown data format {format}") - def decode_arff(fh): + def decode_arff(fh: Any) -> dict: decoder = arff.ArffDecoder() - return decoder.decode(fh, encode_nominal=True, return_type=return_type) + return decoder.decode(fh, encode_nominal=True, return_type=return_type) # type: ignore - if filename[-3:] == ".gz": + if filepath.suffix.endswith(".gz"): with gzip.open(filename) as zipfile: return decode_arff(zipfile) else: - with open(filename, encoding="utf8") as fh: + with filepath.open(encoding="utf8") as fh: return decode_arff(fh) - def _parse_data_from_arff( - self, arff_file_path: str - ) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]: + def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915 + self, + arff_file_path: Path, + ) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: """Parse all required data from arff file. 
Parameters @@ -408,8 +430,7 @@ def _parse_data_from_arff( data = self._get_arff(self.format) except OSError as e: logger.critical( - "Please check that the data file {} is " - "there and can be read.".format(arff_file_path) + f"Please check that the data file {arff_file_path} is " "there and can be read.", ) raise e @@ -423,7 +444,7 @@ def _parse_data_from_arff( attribute_names = [] categories_names = {} categorical = [] - for i, (name, type_) in enumerate(data["attributes"]): + for name, type_ in data["attributes"]: # if the feature is nominal and a sparse matrix is # requested, the categories need to be numeric if isinstance(type_, list) and self.format.lower() == "sparse_arff": @@ -431,8 +452,11 @@ def _parse_data_from_arff( # checks if the strings which should be the class labels # can be encoded into integers pd.factorize(type_)[0] - except ValueError: - raise ValueError("Categorical data needs to be numeric when using sparse ARFF.") + except ValueError as e: + raise ValueError( + "Categorical data needs to be numeric when using sparse ARFF." 
+ ) from e + # string can only be supported with pandas DataFrame elif type_ == "STRING" and self.format.lower() == "sparse_arff": raise ValueError("Dataset containing strings is not supported with sparse ARFF.") @@ -443,10 +467,8 @@ def _parse_data_from_arff( categories_names[name] = type_ if len(type_) == 2: type_norm = [cat.lower().capitalize() for cat in type_] - if set(["True", "False"]) == set(type_norm): - categories_names[name] = [ - True if cat == "True" else False for cat in type_norm - ] + if {"True", "False"} == set(type_norm): + categories_names[name] = [cat == "True" for cat in type_norm] attribute_dtype[name] = "boolean" else: attribute_dtype[name] = "categorical" @@ -468,9 +490,11 @@ def _parse_data_from_arff( col = [] for column_name in X.columns: if attribute_dtype[column_name] in ("categorical", "boolean"): - col.append( - self._unpack_categories(X[column_name], categories_names[column_name]) + categories = self._unpack_categories( + X[column_name], # type: ignore + categories_names[column_name], ) + col.append(categories) elif attribute_dtype[column_name] in ("floating", "integer"): X_col = X[column_name] if X_col.min() >= 0 and X_col.max() <= 255: @@ -486,20 +510,20 @@ def _parse_data_from_arff( col.append(X[column_name]) X = pd.concat(col, axis=1) else: - raise ValueError("Dataset format '{}' is not a valid format.".format(self.format)) + raise ValueError(f"Dataset format '{self.format}' is not a valid format.") - return X, categorical, attribute_names + return X, categorical, attribute_names # type: ignore - def _compressed_cache_file_paths(self, data_file: str) -> Tuple[str, str, str]: - ext = f".{data_file.split('.')[-1]}" - data_pickle_file = data_file.replace(ext, ".pkl.py3") - data_feather_file = data_file.replace(ext, ".feather") - feather_attribute_file = data_file.replace(ext, ".feather.attributes.pkl.py3") + def _compressed_cache_file_paths(self, data_file: Path) -> tuple[Path, Path, Path]: + data_pickle_file = 
data_file.with_suffix(".pkl.py3") + data_feather_file = data_file.with_suffix(".feather") + feather_attribute_file = data_file.with_suffix(".feather.attributes.pkl.py3") return data_pickle_file, data_feather_file, feather_attribute_file def _cache_compressed_file_from_file( - self, data_file: str - ) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]: + self, + data_file: Path, + ) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: """Store data from the local file in compressed format. If a local parquet file is present it will be used instead of the arff file. @@ -511,12 +535,12 @@ def _cache_compressed_file_from_file( feather_attribute_file, ) = self._compressed_cache_file_paths(data_file) - if data_file.endswith(".arff"): + if data_file.suffix == ".arff": data, categorical, attribute_names = self._parse_data_from_arff(data_file) - elif data_file.endswith(".pq"): + elif data_file.suffix == ".pq": try: data = pd.read_parquet(data_file) - except Exception as e: + except Exception as e: # noqa: BLE001 raise Exception(f"File: {data_file}") from e categorical = [data[c].dtype.name == "category" for c in data.columns] @@ -530,13 +554,16 @@ def _cache_compressed_file_from_file( logger.info(f"{self.cache_format} write {self.name}") if self.cache_format == "feather": + assert isinstance(data, pd.DataFrame) + data.to_feather(data_feather_file) - with open(feather_attribute_file, "wb") as fh: + with open(feather_attribute_file, "wb") as fh: # noqa: PTH123 pickle.dump((categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL) self.data_feather_file = data_feather_file self.feather_attribute_file = feather_attribute_file + else: - with open(data_pickle_file, "wb") as fh: + with open(data_pickle_file, "wb") as fh: # noqa: PTH123 pickle.dump((data, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL) self.data_pickle_file = data_pickle_file @@ -545,7 +572,7 @@ def _cache_compressed_file_from_file( return data, 
categorical, attribute_names - def _load_data(self): + def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: # noqa: PLR0912, C901 """Load data from compressed format or arff. Download data if not present on disk.""" need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None need_to_create_feather = self.cache_format == "feather" and self.data_feather_file is None @@ -555,24 +582,31 @@ def _load_data(self): self._download_data() file_to_load = self.data_file if self.parquet_file is None else self.parquet_file - return self._cache_compressed_file_from_file(file_to_load) + assert file_to_load is not None + return self._cache_compressed_file_from_file(Path(file_to_load)) # helper variable to help identify where errors occur fpath = self.data_feather_file if self.cache_format == "feather" else self.data_pickle_file logger.info(f"{self.cache_format} load data {self.name}") try: if self.cache_format == "feather": + assert self.data_feather_file is not None + assert self.feather_attribute_file is not None + data = pd.read_feather(self.data_feather_file) fpath = self.feather_attribute_file - with open(self.feather_attribute_file, "rb") as fh: - categorical, attribute_names = pickle.load(fh) + with open(self.feather_attribute_file, "rb") as fh: # noqa: PTH123 + categorical, attribute_names = pickle.load(fh) # noqa: S301 else: - with open(self.data_pickle_file, "rb") as fh: - data, categorical, attribute_names = pickle.load(fh) - except FileNotFoundError: - raise ValueError(f"Cannot find file for dataset {self.name} at location '{fpath}'.") + assert self.data_pickle_file is not None + with open(self.data_pickle_file, "rb") as fh: # noqa: PTH123 + data, categorical, attribute_names = pickle.load(fh) # noqa: S301 + except FileNotFoundError as e: + raise ValueError( + f"Cannot find file for dataset {self.name} at location '{fpath}'." 
+ ) from e except (EOFError, ModuleNotFoundError, ValueError, AttributeError) as e: - error_message = e.message if hasattr(e, "message") else e.args[0] + error_message = getattr(e, "message", e.args[0]) hint = "" if isinstance(e, EOFError): @@ -591,7 +625,7 @@ def _load_data(self): elif isinstance(e, ValueError) and "unsupported pickle protocol" in e.args[0]: readable_error = "Encountered unsupported pickle protocol" else: - raise # an unknown ValueError is raised, should crash and file bug report + raise e logger.warning( f"{readable_error} when loading dataset {self.id} from '{fpath}'. " @@ -600,19 +634,28 @@ def _load_data(self): "We will continue loading data from the arff-file, " "but this will be much slower for big datasets. " "Please manually delete the cache file if you want OpenML-Python " - "to attempt to reconstruct it." + "to attempt to reconstruct it.", ) - data, categorical, attribute_names = self._parse_data_from_arff(self.data_file) + assert self.data_file is not None + data, categorical, attribute_names = self._parse_data_from_arff(Path(self.data_file)) data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data) if self.cache_format == "pickle" and not data_up_to_date: logger.info("Updating outdated pickle file.") file_to_load = self.data_file if self.parquet_file is None else self.parquet_file - return self._cache_compressed_file_from_file(file_to_load) + assert file_to_load is not None + + return self._cache_compressed_file_from_file(Path(file_to_load)) return data, categorical, attribute_names + # TODO(eddiebergman): Can type this better with overload + # TODO(eddiebergman): Could also techinically use scipy.sparse.sparray @staticmethod - def _convert_array_format(data, array_format, attribute_names): + def _convert_array_format( + data: pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix, + array_format: Literal["array", "dataframe"], + attribute_names: list | None = None, + ) -> pd.DataFrame | pd.Series | 
np.ndarray | scipy.sparse.spmatrix: """Convert a dataset to a given array format. Converts to numpy array if data is non-sparse. @@ -635,18 +678,17 @@ def _convert_array_format(data, array_format, attribute_names): else returns data as is """ - - if array_format == "array" and not scipy.sparse.issparse(data): + if array_format == "array" and not isinstance(data, scipy.sparse.spmatrix): # We encode the categories such that they are integer to be able # to make a conversion to numeric for backward compatibility - def _encode_if_category(column): + def _encode_if_category(column: pd.Series | np.ndarray) -> pd.Series | np.ndarray: if column.dtype.name == "category": column = column.cat.codes.astype(np.float32) mask_nan = column == -1 column[mask_nan] = np.nan return column - if data.ndim == 2: + if isinstance(data, pd.DataFrame): columns = { column_name: _encode_if_category(data.loc[:, column_name]) for column_name in data.columns @@ -654,28 +696,33 @@ def _encode_if_category(column): data = pd.DataFrame(columns) else: data = _encode_if_category(data) + try: - return np.asarray(data, dtype=np.float32) - except ValueError: + # TODO(eddiebergman): float32? + return_array = np.asarray(data, dtype=np.float32) + except ValueError as e: raise PyOpenMLError( "PyOpenML cannot handle string when returning numpy" - ' arrays. Use dataset_format="dataframe".' - ) - elif array_format == "dataframe": + ' arrays. Use dataset_format="dataframe".', + ) from e + + return return_array + + if array_format == "dataframe": if scipy.sparse.issparse(data): data = pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names) else: data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data" logger.warning( - "Cannot convert %s (%s) to '%s'. Returning input data." - % (data_type, type(data), array_format) + f"Cannot convert {data_type} ({type(data)}) to '{array_format}'." 
+ " Returning input data.", ) return data @staticmethod - def _unpack_categories(series, categories): + def _unpack_categories(series: pd.Series, categories: list) -> pd.Series: # nan-likes can not be explicitly specified as a category - def valid_category(cat): + def valid_category(cat: Any) -> bool: return isinstance(cat, str) or (cat is not None and not np.isnan(cat)) filtered_categories = [c for c in categories if valid_category(c)] @@ -685,22 +732,23 @@ def valid_category(cat): col.append(categories[int(x)]) except (TypeError, ValueError): col.append(np.nan) + # We require two lines to create a series of categories as detailed here: - # https://pandas.pydata.org/pandas-docs/version/0.24/user_guide/categorical.html#series-creation # noqa E501 + # https://pandas.pydata.org/pandas-docs/version/0.24/user_guide/categorical.html#series-creation raw_cat = pd.Categorical(col, ordered=True, categories=filtered_categories) return pd.Series(raw_cat, index=series.index, name=series.name) - def get_data( + def get_data( # noqa: C901, PLR0912, PLR0915 self, - target: Optional[Union[List[str], str]] = None, - include_row_id: bool = False, - include_ignore_attribute: bool = False, - dataset_format: str = "dataframe", - ) -> Tuple[ - Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix], - Optional[Union[np.ndarray, pd.DataFrame]], - List[bool], - List[str], + target: list[str] | str | None = None, + include_row_id: bool = False, # noqa: FBT001, FBT002 + include_ignore_attribute: bool = False, # noqa: FBT001, FBT002 + dataset_format: Literal["array", "dataframe"] = "dataframe", + ) -> tuple[ + np.ndarray | pd.DataFrame | scipy.sparse.csr_matrix, + np.ndarray | pd.DataFrame | None, + list[bool], + list[str], ]: """Returns dataset content as dataframes or sparse matrices. 
@@ -759,47 +807,40 @@ def get_data( if len(to_exclude) > 0: logger.info("Going to remove the following attributes: %s" % to_exclude) - keep = np.array( - [True if column not in to_exclude else False for column in attribute_names] - ) - if hasattr(data, "iloc"): - data = data.iloc[:, keep] - else: - data = data[:, keep] + keep = np.array([column not in to_exclude for column in attribute_names]) + data = data.loc[:, keep] if isinstance(data, pd.DataFrame) else data[:, keep] + categorical = [cat for cat, k in zip(categorical, keep) if k] attribute_names = [att for att, k in zip(attribute_names, keep) if k] if target is None: - data = self._convert_array_format(data, dataset_format, attribute_names) + data = self._convert_array_format(data, dataset_format, attribute_names) # type: ignore targets = None else: if isinstance(target, str): - if "," in target: - target = target.split(",") - else: - target = [target] - targets = np.array([True if column in target else False for column in attribute_names]) - target_names = np.array([column for column in attribute_names if column in target]) + target = target.split(",") if "," in target else [target] + targets = np.array([column in target for column in attribute_names]) + target_names = [column for column in attribute_names if column in target] if np.sum(targets) > 1: raise NotImplementedError( - "Number of requested targets %d is not implemented." % np.sum(targets) + "Number of requested targets %d is not implemented." 
% np.sum(targets), ) target_categorical = [ cat for cat, column in zip(categorical, attribute_names) if column in target ] target_dtype = int if target_categorical[0] else float - if hasattr(data, "iloc"): + if isinstance(data, pd.DataFrame): x = data.iloc[:, ~targets] y = data.iloc[:, targets] else: x = data[:, ~targets] - y = data[:, targets].astype(target_dtype) + y = data[:, targets].astype(target_dtype) # type: ignore categorical = [cat for cat, t in zip(categorical, targets) if not t] attribute_names = [att for att, k in zip(attribute_names, targets) if not k] - x = self._convert_array_format(x, dataset_format, attribute_names) + x = self._convert_array_format(x, dataset_format, attribute_names) # type: ignore if dataset_format == "array" and scipy.sparse.issparse(y): # scikit-learn requires dense representation of targets y = np.asarray(y.todense()).astype(target_dtype) @@ -807,15 +848,16 @@ def get_data( # need to flatten it to a 1-d array for _convert_array_format() y = y.squeeze() y = self._convert_array_format(y, dataset_format, target_names) - y = y.astype(target_dtype) if dataset_format == "array" else y + y = y.astype(target_dtype) if isinstance(y, np.ndarray) else y if len(y.shape) > 1 and y.shape[1] == 1: # single column targets should be 1-d for both `array` and `dataframe` formats + assert isinstance(y, (np.ndarray, pd.DataFrame, pd.Series)) y = y.squeeze() data, targets = x, y - return data, targets, categorical, attribute_names + return data, targets, categorical, attribute_names # type: ignore - def _load_features(self): + def _load_features(self) -> None: """Load the features metadata from the server and store it in the dataset object.""" # Delayed Import to avoid circular imports or having to import all of dataset.functions to # import OpenMLDataset. @@ -824,13 +866,13 @@ def _load_features(self): if self.dataset_id is None: raise ValueError( "No dataset id specified. Please set the dataset id. Otherwise we cannot load " - "metadata." 
+ "metadata.", ) features_file = _get_dataset_features_file(None, self.dataset_id) self._features = _read_features(features_file) - def _load_qualities(self): + def _load_qualities(self) -> None: """Load qualities information from the server and store it in the dataset object.""" # same reason as above for _load_features from openml.datasets.functions import _get_dataset_qualities_file @@ -838,7 +880,7 @@ def _load_qualities(self): if self.dataset_id is None: raise ValueError( "No dataset id specified. Please set the dataset id. Otherwise we cannot load " - "metadata." + "metadata.", ) qualities_file = _get_dataset_qualities_file(None, self.dataset_id) @@ -848,7 +890,7 @@ def _load_qualities(self): else: self._qualities = _read_qualities(qualities_file) - def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[str]]: + def retrieve_class_labels(self, target_name: str = "class") -> None | list[str]: """Reads the datasets arff to determine the class-labels. If the task has no class labels (for example a regression problem) @@ -866,13 +908,27 @@ def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[ list """ for feature in self.features.values(): - if (feature.name == target_name) and (feature.data_type == "nominal"): - return feature.nominal_values + if feature.name == target_name: + if feature.data_type == "nominal": + return feature.nominal_values + + if feature.data_type == "string": + # Rel.: #1311 + # The target is invalid for a classification task if the feature type is string + # and not nominal. For such miss-configured tasks, we silently fix it here as + # we can safely interpreter string as nominal. 
+ df, *_ = self.get_data() + return list(df[feature.name].unique()) + return None - def get_features_by_type( - self, data_type, exclude=None, exclude_ignore_attribute=True, exclude_row_id_attribute=True - ): + def get_features_by_type( # noqa: C901 + self, + data_type: str, + exclude: list[str] | None = None, + exclude_ignore_attribute: bool = True, # noqa: FBT002, FBT001 + exclude_row_id_attribute: bool = True, # noqa: FBT002, FBT001 + ) -> list[int]: """ Return indices of features of a given type, e.g. all nominal features. Optional parameters to exclude various features by index or ontology. @@ -882,8 +938,7 @@ def get_features_by_type( data_type : str The data type to return (e.g., nominal, numeric, date, string) exclude : list(int) - Indices to exclude (and adapt the return values as if these indices - are not present) + List of columns to exclude from the return value exclude_ignore_attribute : bool Whether to exclude the defined ignore attributes (and adapt the return values as if these indices are not present) @@ -898,15 +953,12 @@ def get_features_by_type( """ if data_type not in OpenMLDataFeature.LEGAL_DATA_TYPES: raise TypeError("Illegal feature type requested") - if self.ignore_attribute is not None: - if not isinstance(self.ignore_attribute, list): - raise TypeError("ignore_attribute should be a list") - if self.row_id_attribute is not None: - if not isinstance(self.row_id_attribute, str): - raise TypeError("row id attribute should be a str") - if exclude is not None: - if not isinstance(exclude, list): - raise TypeError("Exclude should be a list") + if self.ignore_attribute is not None and not isinstance(self.ignore_attribute, list): + raise TypeError("ignore_attribute should be a list") + if self.row_id_attribute is not None and not isinstance(self.row_id_attribute, str): + raise TypeError("row id attribute should be a str") + if exclude is not None and not isinstance(exclude, list): + raise TypeError("Exclude should be a list") # assert 
all(isinstance(elem, str) for elem in exclude), # "Exclude should be a list of strings" to_exclude = [] @@ -925,35 +977,36 @@ def get_features_by_type( name = self.features[idx].name if name in to_exclude: offset += 1 - else: - if self.features[idx].data_type == data_type: - result.append(idx - offset) + elif self.features[idx].data_type == data_type: + result.append(idx - offset) return result - def _get_file_elements(self) -> Dict: + def _get_file_elements(self) -> dict: """Adds the 'dataset' to file elements.""" - file_elements = {} - path = None if self.data_file is None else os.path.abspath(self.data_file) + file_elements: dict = {} + path = None if self.data_file is None else Path(self.data_file).absolute() if self._dataset is not None: file_elements["dataset"] = self._dataset - elif path is not None and os.path.exists(path): - with open(path, "rb") as fp: + elif path is not None and path.exists(): + with path.open("rb") as fp: file_elements["dataset"] = fp.read() + try: - dataset_utf8 = str(file_elements["dataset"], "utf8") + dataset_utf8 = str(file_elements["dataset"], encoding="utf8") arff.ArffDecoder().decode(dataset_utf8, encode_nominal=True) - except arff.ArffException: - raise ValueError("The file you have provided is not a valid arff file.") + except arff.ArffException as e: + raise ValueError("The file you have provided is not a valid arff file.") from e + elif self.url is None: raise ValueError("No valid url/path to the data file was given.") return file_elements - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.dataset_id = int(xml_response["oml:upload_data_set"]["oml:id"]) - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": + def _to_dict(self) -> dict[str, dict]: """Creates a dictionary representation of self.""" props = [ "id", @@ -981,39 +1034,43 @@ def _to_dict(self) -> "OrderedDict[str, 
OrderedDict]": "md5_checksum", ] - data_container = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' - data_dict = OrderedDict([("@xmlns:oml", "http://openml.org/openml")]) - data_container["oml:data_set_description"] = data_dict - + prop_values = {} for prop in props: content = getattr(self, prop, None) if content is not None: - data_dict["oml:" + prop] = content + prop_values["oml:" + prop] = content - return data_container + return { + "oml:data_set_description": { + "@xmlns:oml": "http://openml.org/openml", + **prop_values, + } + } -def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]: - features_pickle_file = _get_features_pickle_file(features_file) +def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]: + features_pickle_file = Path(_get_features_pickle_file(str(features_file))) try: - with open(features_pickle_file, "rb") as fh_binary: - features = pickle.load(fh_binary) - except: # noqa E722 - with open(features_file, encoding="utf8") as fh: + with features_pickle_file.open("rb") as fh_binary: + return pickle.load(fh_binary) # type: ignore # noqa: S301 + + except: # noqa: E722 + with Path(features_file).open("r", encoding="utf8") as fh: features_xml_string = fh.read() features = _parse_features_xml(features_xml_string) - with open(features_pickle_file, "wb") as fh_binary: + with features_pickle_file.open("wb") as fh_binary: pickle.dump(features, fh_binary) - return features + + return features -def _parse_features_xml(features_xml_string): +def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]: xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value")) features_xml = xml_dict["oml:data_features"] - features = {} + features: dict[int, OpenMLDataFeature] = {} for idx, xmlfeature in enumerate(features_xml["oml:feature"]): nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) feature = OpenMLDataFeature( @@ -1022,6 +1079,7 @@ def 
_parse_features_xml(features_xml_string): xmlfeature["oml:data_type"], xmlfeature.get("oml:nominal_value"), int(nr_missing), + xmlfeature.get("oml:ontology"), ) if idx != feature.index: raise ValueError("Data features not provided in right order") @@ -1030,32 +1088,40 @@ def _parse_features_xml(features_xml_string): return features +# TODO(eddiebergman): Should this really exist? def _get_features_pickle_file(features_file: str) -> str: - """This function only exists so it can be mocked during unit testing""" + """Exists so it can be mocked during unit testing""" return features_file + ".pkl" -def _read_qualities(qualities_file: str) -> Dict[str, float]: - qualities_pickle_file = _get_qualities_pickle_file(qualities_file) +# TODO(eddiebergman): Should this really exist? +def _get_qualities_pickle_file(qualities_file: str) -> str: + """Exists so it can be mocked during unit testing.""" + return qualities_file + ".pkl" + + +def _read_qualities(qualities_file: str | Path) -> dict[str, float]: + qualities_file = Path(qualities_file) + qualities_pickle_file = Path(_get_qualities_pickle_file(str(qualities_file))) try: - with open(qualities_pickle_file, "rb") as fh_binary: - qualities = pickle.load(fh_binary) - except: # noqa E722 - with open(qualities_file, encoding="utf8") as fh: + with qualities_pickle_file.open("rb") as fh_binary: + return pickle.load(fh_binary) # type: ignore # noqa: S301 + except: # noqa: E722 + with qualities_file.open(encoding="utf8") as fh: qualities_xml = fh.read() + qualities = _parse_qualities_xml(qualities_xml) - with open(qualities_pickle_file, "wb") as fh_binary: + with qualities_pickle_file.open("wb") as fh_binary: pickle.dump(qualities, fh_binary) - return qualities + return qualities -def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]: + +def _check_qualities(qualities: list[dict[str, str]]) -> dict[str, float]: qualities_ = {} for xmlquality in qualities: name = xmlquality["oml:name"] - if 
xmlquality.get("oml:value", None) is None: - value = float("NaN") - elif xmlquality["oml:value"] == "null": + if xmlquality.get("oml:value", None) is None or xmlquality["oml:value"] == "null": value = float("NaN") else: value = float(xmlquality["oml:value"]) @@ -1063,12 +1129,7 @@ def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]: return qualities_ -def _parse_qualities_xml(qualities_xml): +def _parse_qualities_xml(qualities_xml: str) -> dict[str, float]: xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] return _check_qualities(qualities) - - -def _get_qualities_pickle_file(qualities_file: str) -> str: - """This function only exists so it can be mocked during unit testing""" - return qualities_file + ".pkl" diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index adbb46c6e..a797588d4 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -1,47 +1,58 @@ # License: BSD 3-Clause +# ruff: noqa: PLR0913 +from __future__ import annotations -import io import logging -import os -from pyexpat import ExpatError -from typing import List, Dict, Optional, Union, cast import warnings +from collections import OrderedDict +from pathlib import Path +from typing import TYPE_CHECKING, Any, overload +from typing_extensions import Literal +import arff import minio.error import numpy as np -import arff import pandas as pd import urllib3 - import xmltodict +from pyexpat import ExpatError from scipy.sparse import coo_matrix -from collections import OrderedDict -import openml.utils import openml._api_calls -from .dataset import OpenMLDataset -from ..exceptions import ( +import openml.utils +from openml.exceptions import ( OpenMLHashException, + OpenMLPrivateDatasetError, OpenMLServerError, OpenMLServerException, - OpenMLPrivateDatasetError, ) -from ..utils import _remove_cache_dir_for_id, _create_cache_directory_for_id, 
_get_cache_dir_for_id +from openml.utils import ( + _create_cache_directory_for_id, + _get_cache_dir_for_id, + _remove_cache_dir_for_id, +) + +from .dataset import OpenMLDataset + +if TYPE_CHECKING: + import scipy DATASETS_CACHE_DIR_NAME = "datasets" logger = logging.getLogger(__name__) +NO_ACCESS_GRANTED_ERRCODE = 112 ############################################################################ # Local getters/accessors to the cache directory -def _get_cache_directory(dataset: OpenMLDataset) -> str: - """Return the cache directory of the OpenMLDataset""" +def _get_cache_directory(dataset: OpenMLDataset) -> Path: + """Creates and returns the cache directory of the OpenMLDataset.""" + assert dataset.dataset_id is not None return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id) -def list_qualities() -> List[str]: +def list_qualities() -> list[str]: """Return list of data qualities available. The function performs an API call to retrieve the entire list of @@ -56,22 +67,63 @@ def list_qualities() -> List[str]: qualities = xmltodict.parse(xml_string, force_list=("oml:quality")) # Minimalistic check if the XML is useful if "oml:data_qualities_list" not in qualities: - raise ValueError("Error in return XML, does not contain " '"oml:data_qualities_list"') + raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') + if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): raise TypeError("Error in return XML, does not contain " '"oml:quality" as a list') - qualities = qualities["oml:data_qualities_list"]["oml:quality"] - return qualities + + return qualities["oml:data_qualities_list"]["oml:quality"] +@overload def list_datasets( - data_id: Optional[List[int]] = None, - offset: Optional[int] = None, - size: Optional[int] = None, - status: Optional[str] = None, - tag: Optional[str] = None, - output_format: str = "dict", - **kwargs, -) -> Union[Dict, pd.DataFrame]: + data_id: list[int] | None = ..., + 
offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + tag: str | None = ..., + *, + output_format: Literal["dataframe"], + **kwargs: Any, +) -> pd.DataFrame: + ... + + +@overload +def list_datasets( + data_id: list[int] | None, + offset: int | None, + size: int | None, + status: str | None, + tag: str | None, + output_format: Literal["dataframe"], + **kwargs: Any, +) -> pd.DataFrame: + ... + + +@overload +def list_datasets( + data_id: list[int] | None = ..., + offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + tag: str | None = ..., + output_format: Literal["dict"] = "dict", + **kwargs: Any, +) -> pd.DataFrame: + ... + + +def list_datasets( + data_id: list[int] | None = None, + offset: int | None = None, + size: int | None = None, + status: str | None = None, + tag: str | None = None, + output_format: Literal["dataframe", "dict"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Return a list of all dataset which are on OpenML. Supports large amount of results. @@ -126,7 +178,7 @@ def list_datasets( """ if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", ) # TODO: [0.15] @@ -138,9 +190,9 @@ def list_datasets( ) warnings.warn(msg, category=FutureWarning, stacklevel=2) - return openml.utils._list_all( + return openml.utils._list_all( # type: ignore data_id=data_id, - output_format=output_format, + list_output_format=output_format, # type: ignore listing_call=_list_datasets, offset=offset, size=size, @@ -150,7 +202,29 @@ def list_datasets( ) -def _list_datasets(data_id: Optional[List] = None, output_format="dict", **kwargs): +@overload +def _list_datasets( + data_id: list | None = ..., + output_format: Literal["dict"] = "dict", + **kwargs: Any, +) -> dict: + ... 
+ + +@overload +def _list_datasets( + data_id: list | None = ..., + output_format: Literal["dataframe"] = "dataframe", + **kwargs: Any, +) -> pd.DataFrame: + ... + + +def _list_datasets( + data_id: list | None = None, + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Perform api call to return a list of all datasets. @@ -176,28 +250,42 @@ def _list_datasets(data_id: Optional[List] = None, output_format="dict", **kwarg ------- datasets : dict of dicts, or dataframe """ - api_call = "data/list" if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" if data_id is not None: api_call += "/data_id/%s" % ",".join([str(int(i)) for i in data_id]) return __list_datasets(api_call=api_call, output_format=output_format) -def __list_datasets(api_call, output_format="dict"): +@overload +def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: + ... + + +@overload +def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: + ... 
+ + +def __list_datasets( + api_call: str, + output_format: Literal["dict", "dataframe"] = "dict", +) -> dict | pd.DataFrame: xml_string = openml._api_calls._perform_api_call(api_call, "get") datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) # Minimalistic check if the XML is useful - assert type(datasets_dict["oml:data"]["oml:dataset"]) == list, type(datasets_dict["oml:data"]) + assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( + datasets_dict["oml:data"], + ) assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ "oml:data" ]["@xmlns:oml"] - datasets = dict() + datasets = {} for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: ignore_attribute = ["oml:file_id", "oml:quality"] dataset = { @@ -207,7 +295,7 @@ def __list_datasets(api_call, output_format="dict"): dataset["version"] = int(dataset["version"]) # The number of qualities can range from 0 to infinity - for quality in dataset_.get("oml:quality", list()): + for quality in dataset_.get("oml:quality", []): try: dataset[quality["@name"]] = int(quality["#text"]) except ValueError: @@ -220,7 +308,7 @@ def __list_datasets(api_call, output_format="dict"): return datasets -def _expand_parameter(parameter: Union[str, List[str]]) -> List[str]: +def _expand_parameter(parameter: str | list[str] | None) -> list[str]: expanded_parameter = [] if isinstance(parameter, str): expanded_parameter = [x.strip() for x in parameter.split(",")] @@ -230,23 +318,24 @@ def _expand_parameter(parameter: Union[str, List[str]]) -> List[str]: def _validated_data_attributes( - attributes: List[str], data_attributes: List[str], parameter_name: str + attributes: list[str], + data_attributes: list[tuple[str, Any]], + parameter_name: str, ) -> None: for attribute_ in attributes: - is_attribute_a_data_attribute = any([attr[0] == attribute_ for attr in data_attributes]) + is_attribute_a_data_attribute = any(dattr[0] == attribute_ for dattr in data_attributes) 
if not is_attribute_a_data_attribute: raise ValueError( - "all attribute of '{}' should be one of the data attribute. " - " Got '{}' while candidates are {}.".format( - parameter_name, attribute_, [attr[0] for attr in data_attributes] - ) + f"all attribute of '{parameter_name}' should be one of the data attribute. " + f" Got '{attribute_}' while candidates are" + f" {[dattr[0] for dattr in data_attributes]}.", ) def check_datasets_active( - dataset_ids: List[int], - raise_error_if_not_exist: bool = True, -) -> Dict[int, bool]: + dataset_ids: list[int], + raise_error_if_not_exist: bool = True, # noqa: FBT001, FBT002 +) -> dict[int, bool]: """ Check if the dataset ids provided are active. @@ -276,7 +365,9 @@ def check_datasets_active( def _name_to_id( - dataset_name: str, version: Optional[int] = None, error_if_multiple: bool = False + dataset_name: str, + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT001, FBT002 ) -> int: """Attempt to find the dataset id of the dataset with the given name. @@ -304,27 +395,30 @@ def _name_to_id( The id of the dataset. """ status = None if version is not None else "active" - candidates = cast( - pd.DataFrame, - list_datasets( - data_name=dataset_name, status=status, data_version=version, output_format="dataframe" - ), + candidates = list_datasets( + data_name=dataset_name, + status=status, + data_version=version, + output_format="dataframe", ) if error_if_multiple and len(candidates) > 1: msg = f"Multiple active datasets exist with name '{dataset_name}'." raise ValueError(msg) + if candidates.empty: no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'" and_version = f" and version '{version}'." if version is not None else "." 
raise RuntimeError(no_dataset_for_name + and_version) # Dataset ids are chronological so we can just sort based on ids (instead of version) - return candidates["did"].min() + return candidates["did"].min() # type: ignore def get_datasets( - dataset_ids: List[Union[str, int]], download_data: bool = True, download_qualities: bool = True -) -> List[OpenMLDataset]: + dataset_ids: list[str | int], + download_data: bool = True, # noqa: FBT001, FBT002 + download_qualities: bool = True, # noqa: FBT001, FBT002 +) -> list[OpenMLDataset]: """Download datasets. This function iterates :meth:`openml.datasets.get_dataset`. @@ -350,22 +444,22 @@ def get_datasets( datasets = [] for dataset_id in dataset_ids: datasets.append( - get_dataset(dataset_id, download_data, download_qualities=download_qualities) + get_dataset(dataset_id, download_data, download_qualities=download_qualities), ) return datasets @openml.utils.thread_safe_if_oslo_installed -def get_dataset( - dataset_id: Union[int, str], - download_data: Optional[bool] = None, # Optional for deprecation warning; later again only bool - version: Optional[int] = None, - error_if_multiple: bool = False, - cache_format: str = "pickle", - download_qualities: Optional[bool] = None, # Same as above - download_features_meta_data: Optional[bool] = None, # Same as above - download_all_files: bool = False, - force_refresh_cache: bool = False, +def get_dataset( # noqa: C901, PLR0912 + dataset_id: int | str, + download_data: bool | None = None, # Optional for deprecation warning; later again only bool + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT002, FBT001 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool | None = None, # Same as above + download_features_meta_data: bool | None = None, # Same as above + download_all_files: bool = False, # noqa: FBT002, FBT001 + force_refresh_cache: bool = False, # noqa: FBT001, FBT002 ) -> OpenMLDataset: """Download the OpenML dataset 
representation, optionally also download actual data file. @@ -442,6 +536,7 @@ def get_dataset( "`download_qualities`, and `download_features_meta_data` to a bool while calling " "`get_dataset`.", FutureWarning, + stacklevel=2, ) download_data = True if download_data is None else download_data @@ -452,13 +547,15 @@ def get_dataset( if download_all_files: warnings.warn( - "``download_all_files`` is experimental and is likely to break with new releases." + "``download_all_files`` is experimental and is likely to break with new releases.", + FutureWarning, + stacklevel=2, ) if cache_format not in ["feather", "pickle"]: raise ValueError( "cache_format must be one of 'feather' or 'pickle. " - "Invalid format specified: {}".format(cache_format) + f"Invalid format specified: {cache_format}", ) if isinstance(dataset_id, str): @@ -468,12 +565,12 @@ def get_dataset( dataset_id = _name_to_id(dataset_id, version, error_if_multiple) # type: ignore elif not isinstance(dataset_id, int): raise TypeError( - "`dataset_id` must be one of `str` or `int`, not {}.".format(type(dataset_id)) + f"`dataset_id` must be one of `str` or `int`, not {type(dataset_id)}.", ) if force_refresh_cache: did_cache_dir = _get_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - if os.path.exists(did_cache_dir): + if did_cache_dir.exists(): _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) did_cache_dir = _create_cache_directory_for_id( @@ -493,10 +590,11 @@ def get_dataset( qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) arff_file = _get_dataset_arff(description) if download_data else None - if "oml:minio_url" in description and download_data: + if "oml:parquet_url" in description and download_data: try: parquet_file = _get_dataset_parquet( - description, download_all_files=download_all_files + description, + download_all_files=download_all_files, ) except urllib3.exceptions.MaxRetryError: parquet_file = None @@ -508,21 +606,25 @@ def get_dataset( except 
OpenMLServerException as e: # if there was an exception # check if the user had access to the dataset - if e.code == 112: + if e.code == NO_ACCESS_GRANTED_ERRCODE: raise OpenMLPrivateDatasetError(e.message) from None - else: - raise e + + raise e finally: if remove_dataset_cache: _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) - dataset = _create_dataset_from_description( - description, features_file, qualities_file, arff_file, parquet_file, cache_format + return _create_dataset_from_description( + description, + features_file, + qualities_file, + arff_file, + parquet_file, + cache_format, ) - return dataset -def attributes_arff_from_df(df): +def attributes_arff_from_df(df: pd.DataFrame) -> list[tuple[str, list[str] | str]]: """Describe attributes of the dataframe according to ARFF specification. Parameters @@ -532,13 +634,13 @@ def attributes_arff_from_df(df): Returns ------- - attributes_arff : str + attributes_arff : list[str] The data set attributes as required by the ARFF format. """ PD_DTYPES_TO_ARFF_DTYPE = {"integer": "INTEGER", "floating": "REAL", "string": "STRING"} - attributes_arff = [] + attributes_arff: list[tuple[str, list[str] | str]] = [] - if not all([isinstance(column_name, str) for column_name in df.columns]): + if not all(isinstance(column_name, str) for column_name in df.columns): logger.warning("Converting non-str column names to str.") df.columns = [str(column_name) for column_name in df.columns] @@ -555,47 +657,50 @@ def attributes_arff_from_df(df): categories_dtype = pd.api.types.infer_dtype(categories) if categories_dtype not in ("string", "unicode"): raise ValueError( - "The column '{}' of the dataframe is of " + f"The column '{column_name}' of the dataframe is of " "'category' dtype. Therefore, all values in " "this columns should be string. Please " "convert the entries which are not string. 
" - "Got {} dtype in this column.".format(column_name, categories_dtype) + f"Got {categories_dtype} dtype in this column.", ) attributes_arff.append((column_name, categories.tolist())) elif column_dtype == "boolean": # boolean are encoded as categorical. attributes_arff.append((column_name, ["True", "False"])) - elif column_dtype in PD_DTYPES_TO_ARFF_DTYPE.keys(): + elif column_dtype in PD_DTYPES_TO_ARFF_DTYPE: attributes_arff.append((column_name, PD_DTYPES_TO_ARFF_DTYPE[column_dtype])) else: raise ValueError( - "The dtype '{}' of the column '{}' is not " + f"The dtype '{column_dtype}' of the column '{column_name}' is not " "currently supported by liac-arff. Supported " "dtypes are categorical, string, integer, " - "floating, and boolean.".format(column_dtype, column_name) + "floating, and boolean.", ) return attributes_arff -def create_dataset( - name, - description, - creator, - contributor, - collection_date, - language, - licence, - attributes, - data, - default_target_attribute, - ignore_attribute, - citation, - row_id_attribute=None, - original_data_url=None, - paper_url=None, - update_comment=None, - version_label=None, -): +def create_dataset( # noqa: C901, PLR0912, PLR0915 + name: str, + description: str | None, + creator: str | None, + contributor: str | None, + collection_date: str | None, + language: str | None, + licence: str | None, + # TODO(eddiebergman): Docstring says `type` but I don't know what this is other than strings + # Edit: Found it could also be like ["True", "False"] + attributes: list[tuple[str, str | list[str]]] | dict[str, str | list[str]] | Literal["auto"], + data: pd.DataFrame | np.ndarray | scipy.sparse.coo_matrix, + # TODO(eddiebergman): Function requires `default_target_attribute` exist but API allows None + default_target_attribute: str, + ignore_attribute: str | list[str] | None, + citation: str, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + update_comment: str 
| None = None, + version_label: str | None = None, +) -> OpenMLDataset: """Create a dataset. This function creates an OpenMLDataset object. @@ -661,8 +766,8 @@ def create_dataset( Returns ------- class:`openml.OpenMLDataset` - Dataset description.""" - + Dataset description. + """ if isinstance(data, pd.DataFrame): # infer the row id from the index of the dataset if row_id_attribute is None: @@ -673,10 +778,10 @@ def create_dataset( data = data.reset_index() if attributes == "auto" or isinstance(attributes, dict): - if not hasattr(data, "columns"): + if not isinstance(data, pd.DataFrame): raise ValueError( "Automatically inferring attributes requires " - "a pandas DataFrame. A {!r} was given instead.".format(data) + f"a pandas DataFrame. A {data!r} was given instead.", ) # infer the type of data for each column of the DataFrame attributes_ = attributes_arff_from_df(data) @@ -684,7 +789,7 @@ def create_dataset( # override the attributes which was specified by the user for attr_idx in range(len(attributes_)): attr_name = attributes_[attr_idx][0] - if attr_name in attributes.keys(): + if attr_name in attributes: attributes_[attr_idx] = (attr_name, attributes[attr_name]) else: attributes_ = attributes @@ -695,26 +800,28 @@ def create_dataset( _validated_data_attributes(default_target_attributes, attributes_, "default_target_attribute") if row_id_attribute is not None: - is_row_id_an_attribute = any([attr[0] == row_id_attribute for attr in attributes_]) + is_row_id_an_attribute = any(attr[0] == row_id_attribute for attr in attributes_) if not is_row_id_an_attribute: raise ValueError( "'row_id_attribute' should be one of the data attribute. 
" " Got '{}' while candidates are {}.".format( - row_id_attribute, [attr[0] for attr in attributes_] - ) + row_id_attribute, + [attr[0] for attr in attributes_], + ), ) - if hasattr(data, "columns"): + if isinstance(data, pd.DataFrame): if all(isinstance(dtype, pd.SparseDtype) for dtype in data.dtypes): data = data.sparse.to_coo() # liac-arff only support COO matrices with sorted rows - row_idx_sorted = np.argsort(data.row) - data.row = data.row[row_idx_sorted] - data.col = data.col[row_idx_sorted] - data.data = data.data[row_idx_sorted] + row_idx_sorted = np.argsort(data.row) # type: ignore + data.row = data.row[row_idx_sorted] # type: ignore + data.col = data.col[row_idx_sorted] # type: ignore + data.data = data.data[row_idx_sorted] # type: ignore else: - data = data.values + data = data.to_numpy() + data_format: Literal["arff", "sparse_arff"] if isinstance(data, (list, np.ndarray)): if isinstance(data[0], (list, np.ndarray)): data_format = "arff" @@ -725,7 +832,7 @@ def create_dataset( "When giving a list or a numpy.ndarray, " "they should contain a list/ numpy.ndarray " "for dense data or a dictionary for sparse " - "data. Got {!r} instead.".format(data[0]) + f"data. Got {data[0]!r} instead.", ) elif isinstance(data, coo_matrix): data_format = "sparse_arff" @@ -734,7 +841,7 @@ def create_dataset( "When giving a list or a numpy.ndarray, " "they should contain a list/ numpy.ndarray " "for dense data or a dictionary for sparse " - "data. Got {!r} instead.".format(data[0]) + f"data. 
Got {data[0]!r} instead.", ) arff_object = { @@ -751,11 +858,10 @@ def create_dataset( decoder = arff.ArffDecoder() return_type = arff.COO if data_format == "sparse_arff" else arff.DENSE decoder.decode(arff_dataset, encode_nominal=True, return_type=return_type) - except arff.ArffException: + except arff.ArffException as e: raise ValueError( - "The arguments you have provided \ - do not construct a valid ARFF file" - ) + "The arguments you have provided do not construct a valid ARFF file" + ) from e return OpenMLDataset( name=name, @@ -778,7 +884,7 @@ def create_dataset( ) -def status_update(data_id, status): +def status_update(data_id: int, status: Literal["active", "deactivated"]) -> None: """ Updates the status of a dataset to either 'active' or 'deactivated'. Please see the OpenML API documentation for a description of the status @@ -794,8 +900,9 @@ def status_update(data_id, status): """ legal_status = {"active", "deactivated"} if status not in legal_status: - raise ValueError("Illegal status value. " "Legal values: %s" % legal_status) - data = {"data_id": data_id, "status": status} + raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") + + data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status} result_xml = openml._api_calls._perform_api_call("data/status/update", "post", data=data) result = xmltodict.parse(result_xml) server_data_id = result["oml:data_status_update"]["oml:id"] @@ -806,18 +913,18 @@ def status_update(data_id, status): def edit_dataset( - data_id, - description=None, - creator=None, - contributor=None, - collection_date=None, - language=None, - default_target_attribute=None, - ignore_attribute=None, - citation=None, - row_id_attribute=None, - original_data_url=None, - paper_url=None, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, ) -> int: """Edits an OpenMLDataset. 
@@ -877,7 +984,7 @@ def edit_dataset( Dataset id """ if not isinstance(data_id, int): - raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") # compose data edit parameters as xml form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE @@ -902,10 +1009,13 @@ def edit_dataset( del xml["oml:data_edit_parameters"][k] file_elements = { - "edit_parameters": ("description.xml", xmltodict.unparse(xml)) + "edit_parameters": ("description.xml", xmltodict.unparse(xml)), } # type: openml._api_calls.FILE_ELEMENTS_TYPE result_xml = openml._api_calls._perform_api_call( - "data/edit", "post", data=form_data, file_elements=file_elements + "data/edit", + "post", + data=form_data, + file_elements=file_elements, ) result = xmltodict.parse(result_xml) data_id = result["oml:data_edit"]["oml:id"] @@ -942,7 +1052,7 @@ def fork_dataset(data_id: int) -> int: """ if not isinstance(data_id, int): - raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") # compose data fork parameters form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data) @@ -951,19 +1061,77 @@ def fork_dataset(data_id: int) -> int: return int(data_id) -def _topic_add_dataset(data_id: int, topic: str): +def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: + """ + An ontology describes the concept that are described in a feature. An + ontology is defined by an URL where the information is provided. Adds + an ontology (URL) to a given dataset feature (defined by a dataset id + and index). The dataset has to exists on OpenML and needs to have been + processed by the evaluation engine. 
+ + Parameters + ---------- + data_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology} + openml._api_calls._perform_api_call("data/feature/ontology/add", "post", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + + +def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool: + """ + Removes an existing ontology (URL) from a given dataset feature (defined + by a dataset id and index). The dataset has to exists on OpenML and needs + to have been processed by the evaluation engine. Ontology needs to be + attached to the specific fearure. + + Parameters + ---------- + data_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology} + openml._api_calls._perform_api_call("data/feature/ontology/remove", "post", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + + +def _topic_add_dataset(data_id: int, topic: str) -> int: """ Adds a topic for a dataset. This API is not available for all OpenML users and is accessible only by admins. 
+ Parameters ---------- data_id : int id of the dataset for which the topic needs to be added topic : str Topic to be added for the dataset + + Returns + ------- + Dataset id """ if not isinstance(data_id, int): - raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data) result = xmltodict.parse(result_xml) @@ -971,10 +1139,11 @@ def _topic_add_dataset(data_id: int, topic: str): return int(data_id) -def _topic_delete_dataset(data_id: int, topic: str): +def _topic_delete_dataset(data_id: int, topic: str) -> int: """ Removes a topic from a dataset. This API is not available for all OpenML users and is accessible only by admins. + Parameters ---------- data_id : int @@ -982,9 +1151,12 @@ def _topic_delete_dataset(data_id: int, topic: str): topic : str Topic to be deleted + Returns + ------- + Dataset id """ if not isinstance(data_id, int): - raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data) result = xmltodict.parse(result_xml) @@ -992,14 +1164,14 @@ def _topic_delete_dataset(data_id: int, topic: str): return int(data_id) -def _get_dataset_description(did_cache_dir, dataset_id): +def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, Any]: """Get the dataset description as xml dictionary. This function is NOT thread/multiprocessing safe. Parameters ---------- - did_cache_dir : str + did_cache_dir : Path Cache subdirectory for this dataset. 
dataset_id : int @@ -1011,35 +1183,35 @@ def _get_dataset_description(did_cache_dir, dataset_id): XML Dataset description parsed to a dict. """ - # TODO implement a cache for this that invalidates itself after some time # This can be saved on disk, but cannot be cached properly, because # it contains the information on whether a dataset is active. - description_file = os.path.join(did_cache_dir, "description.xml") + description_file = did_cache_dir / "description.xml" try: - with io.open(description_file, encoding="utf8") as fh: + with description_file.open(encoding="utf8") as fh: dataset_xml = fh.read() description = xmltodict.parse(dataset_xml)["oml:data_set_description"] - except Exception: - url_extension = "data/{}".format(dataset_id) + except Exception: # noqa: BLE001 + url_extension = f"data/{dataset_id}" dataset_xml = openml._api_calls._perform_api_call(url_extension, "get") try: description = xmltodict.parse(dataset_xml)["oml:data_set_description"] except ExpatError as e: url = openml._api_calls._create_url_from_endpoint(url_extension) raise OpenMLServerError(f"Dataset description XML at '{url}' is malformed.") from e - with io.open(description_file, "w", encoding="utf8") as fh: + + with description_file.open("w", encoding="utf8") as fh: fh.write(dataset_xml) - return description + return description # type: ignore def _get_dataset_parquet( - description: Union[Dict, OpenMLDataset], - cache_directory: Optional[str] = None, - download_all_files: bool = False, -) -> Optional[str]: + description: dict | OpenMLDataset, + cache_directory: Path | None = None, + download_all_files: bool = False, # noqa: FBT001, FBT002 +) -> Path | None: """Return the path to the local parquet file of the dataset. If is not cached, it is downloaded. Checks if the file is in the cache, if yes, return the path to the file. @@ -1054,59 +1226,62 @@ def _get_dataset_parquet( description : dictionary or OpenMLDataset Either a dataset description as dict or OpenMLDataset. 
- cache_directory: str, optional (default=None) + cache_directory: Path, optional (default=None) Folder to store the parquet file in. If None, use the default cache directory for the dataset. download_all_files: bool, optional (default=False) If `True`, download all data found in the bucket to which the description's - ``minio_url`` points, only download the parquet file otherwise. + ``parquet_url`` points, only download the parquet file otherwise. Returns ------- - output_filename : string, optional + output_filename : Path, optional Location of the Parquet file if successfully downloaded, None otherwise. """ if isinstance(description, dict): - url = cast(str, description.get("oml:minio_url")) - did = description.get("oml:id") + url = str(description.get("oml:parquet_url")) + did = int(description.get("oml:id")) # type: ignore elif isinstance(description, OpenMLDataset): - url = cast(str, description._minio_url) - did = description.dataset_id + url = str(description._parquet_url) + assert description.dataset_id is not None + + did = int(description.dataset_id) else: raise TypeError("`description` should be either OpenMLDataset or Dict.") if cache_directory is None: cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) - output_file_path = os.path.join(cache_directory, f"dataset_{did}.pq") - old_file_path = os.path.join(cache_directory, "dataset.pq") - if os.path.isfile(old_file_path): - os.rename(old_file_path, output_file_path) + output_file_path = cache_directory / f"dataset_{did}.pq" + + old_file_path = cache_directory / "dataset.pq" + if old_file_path.is_file(): + old_file_path.rename(output_file_path) # For this release, we want to be able to force a new download even if the # parquet file is already present when ``download_all_files`` is set. # For now, it would be the only way for the user to fetch the additional # files in the bucket (no function exists on an OpenMLDataset to do this). 
if download_all_files: - if url.endswith(".pq"): - url, _ = url.rsplit("/", maxsplit=1) - openml._api_calls._download_minio_bucket(source=cast(str, url), destination=cache_directory) + openml._api_calls._download_minio_bucket(source=url, destination=cache_directory) - if not os.path.isfile(output_file_path): + if not output_file_path.is_file(): try: openml._api_calls._download_minio_file( - source=cast(str, url), destination=output_file_path + source=url, + destination=output_file_path, ) except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: - logger.warning("Could not download file from %s: %s" % (cast(str, url), e)) + logger.warning(f"Could not download file from {url}: {e}") return None return output_file_path def _get_dataset_arff( - description: Union[Dict, OpenMLDataset], cache_directory: Optional[str] = None -) -> str: + description: dict | OpenMLDataset, + cache_directory: Path | None = None, +) -> Path: """Return the path to the local arff file of the dataset. If is not cached, it is downloaded. Checks if the file is in the cache, if yes, return the path to the file. @@ -1120,48 +1295,56 @@ def _get_dataset_arff( description : dictionary or OpenMLDataset Either a dataset description as dict or OpenMLDataset. - cache_directory: str, optional (default=None) + cache_directory: Path, optional (default=None) Folder to store the arff file in. If None, use the default cache directory for the dataset. Returns ------- - output_filename : string + output_filename : Path Location of ARFF file. 
""" if isinstance(description, dict): md5_checksum_fixture = description.get("oml:md5_checksum") - url = description["oml:url"] - did = description.get("oml:id") + url = str(description["oml:url"]) + did = int(description.get("oml:id")) # type: ignore elif isinstance(description, OpenMLDataset): md5_checksum_fixture = description.md5_checksum + assert description.url is not None + assert description.dataset_id is not None + url = description.url - did = description.dataset_id + did = int(description.dataset_id) else: raise TypeError("`description` should be either OpenMLDataset or Dict.") - if cache_directory is None: - cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) - output_file_path = os.path.join(cache_directory, "dataset.arff") + save_cache_directory = ( + _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) + if cache_directory is None + else Path(cache_directory) + ) + output_file_path = save_cache_directory / "dataset.arff" try: openml._api_calls._download_text_file( - source=url, output_path=output_file_path, md5_checksum=md5_checksum_fixture + source=url, + output_path=output_file_path, + md5_checksum=md5_checksum_fixture, ) except OpenMLHashException as e: - additional_info = " Raised when downloading dataset {}.".format(did) + additional_info = f" Raised when downloading dataset {did}." e.args = (e.args[0] + additional_info,) - raise + raise e return output_file_path -def _get_features_xml(dataset_id): +def _get_features_xml(dataset_id: int) -> str: url_extension = f"data/features/{dataset_id}" return openml._api_calls._perform_api_call(url_extension, "get") -def _get_dataset_features_file(did_cache_dir: Union[str, None], dataset_id: int) -> str: +def _get_dataset_features_file(did_cache_dir: str | Path | None, dataset_id: int) -> Path: """API call to load dataset features. Loads from cache or downloads them. Features are feature descriptions for each column. 
@@ -1179,37 +1362,36 @@ def _get_dataset_features_file(did_cache_dir: Union[str, None], dataset_id: int) Returns ------- - str + Path Path of the cached dataset feature file """ - + did_cache_dir = Path(did_cache_dir) if did_cache_dir is not None else None if did_cache_dir is None: - did_cache_dir = _create_cache_directory_for_id( - DATASETS_CACHE_DIR_NAME, - dataset_id, - ) + did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - features_file = os.path.join(did_cache_dir, "features.xml") + features_file = did_cache_dir / "features.xml" # Dataset features aren't subject to change... - if not os.path.isfile(features_file): + if not features_file.is_file(): features_xml = _get_features_xml(dataset_id) - with io.open(features_file, "w", encoding="utf8") as fh: + with features_file.open("w", encoding="utf8") as fh: fh.write(features_xml) return features_file -def _get_qualities_xml(dataset_id): - url_extension = f"data/qualities/{dataset_id}" +def _get_qualities_xml(dataset_id: int) -> str: + url_extension = f"data/qualities/{dataset_id!s}" return openml._api_calls._perform_api_call(url_extension, "get") def _get_dataset_qualities_file( - did_cache_dir: Union[str, None], dataset_id: int -) -> Union[str, None]: - """API call to load dataset qualities. Loads from cache or downloads them. + did_cache_dir: str | Path | None, + dataset_id: int, +) -> Path | None: + """Get the path for the dataset qualities file, or None if no qualities exist. + Loads from cache or downloads them. Features are metafeatures (number of features, number of classes, ...) This function is NOT thread/multiprocessing safe. @@ -1222,47 +1404,45 @@ def _get_dataset_qualities_file( dataset_id : int Dataset ID - download_qualities : bool - wheather to download/use cahsed version or not. 
Returns ------- str Path of the cached qualities file """ - if did_cache_dir is None: - did_cache_dir = _create_cache_directory_for_id( - DATASETS_CACHE_DIR_NAME, - dataset_id, - ) + save_did_cache_dir = ( + _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) + if did_cache_dir is None + else Path(did_cache_dir) + ) # Dataset qualities are subject to change and must be fetched every time - qualities_file = os.path.join(did_cache_dir, "qualities.xml") + qualities_file = save_did_cache_dir / "qualities.xml" try: - with io.open(qualities_file, encoding="utf8") as fh: + with qualities_file.open(encoding="utf8") as fh: qualities_xml = fh.read() - except (OSError, IOError): + except OSError: try: qualities_xml = _get_qualities_xml(dataset_id) - with io.open(qualities_file, "w", encoding="utf8") as fh: + with qualities_file.open("w", encoding="utf8") as fh: fh.write(qualities_xml) except OpenMLServerException as e: if e.code == 362 and str(e) == "No qualities found - None": # quality file stays as None - logger.warning("No qualities found for dataset {}".format(dataset_id)) + logger.warning(f"No qualities found for dataset {dataset_id}") return None - else: - raise + + raise e return qualities_file def _create_dataset_from_description( - description: Dict[str, str], - features_file: Optional[str] = None, - qualities_file: Optional[str] = None, - arff_file: Optional[str] = None, - parquet_file: Optional[str] = None, - cache_format: str = "pickle", + description: dict[str, str], + features_file: Path | None = None, + qualities_file: Path | None = None, + arff_file: Path | None = None, + parquet_file: Path | None = None, + cache_format: Literal["pickle", "feather"] = "pickle", ) -> OpenMLDataset: """Create a dataset object from a description dict. @@ -1270,9 +1450,9 @@ def _create_dataset_from_description( ---------- description : dict Description of a dataset in xml dict. - featuresfile : str + features_file : str Path of the dataset features as xml file. 
- qualities : list + qualities_file : list Path of the dataset qualities as xml file. arff_file : string, optional Path of dataset ARFF file. @@ -1289,9 +1469,9 @@ def _create_dataset_from_description( return OpenMLDataset( description["oml:name"], description.get("oml:description"), - data_format=description["oml:format"], - dataset_id=description["oml:id"], - version=description["oml:version"], + data_format=description["oml:format"], # type: ignore + dataset_id=int(description["oml:id"]), + version=int(description["oml:version"]), creator=description.get("oml:creator"), contributor=description.get("oml:contributor"), collection_date=description.get("oml:collection_date"), @@ -1310,16 +1490,16 @@ def _create_dataset_from_description( paper_url=description.get("oml:paper_url"), update_comment=description.get("oml:update_comment"), md5_checksum=description.get("oml:md5_checksum"), - data_file=arff_file, + data_file=str(arff_file) if arff_file is not None else None, cache_format=cache_format, - features_file=features_file, - qualities_file=qualities_file, - minio_url=description.get("oml:minio_url"), - parquet_file=parquet_file, + features_file=str(features_file) if features_file is not None else None, + qualities_file=str(qualities_file) if qualities_file is not None else None, + parquet_url=description.get("oml:parquet_url"), + parquet_file=str(parquet_file) if parquet_file is not None else None, ) -def _get_online_dataset_arff(dataset_id): +def _get_online_dataset_arff(dataset_id: int) -> str | None: """Download the ARFF file for a given dataset id from the OpenML website. @@ -1330,8 +1510,8 @@ def _get_online_dataset_arff(dataset_id): Returns ------- - str - A string representation of an ARFF file. + str or None + A string representation of an ARFF file. Or None if file already exists. """ dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get") # build a dict from the xml. 
@@ -1341,7 +1521,7 @@ def _get_online_dataset_arff(dataset_id): ) -def _get_online_dataset_format(dataset_id): +def _get_online_dataset_format(dataset_id: int) -> str: """Get the dataset format for a given dataset id from the OpenML website. @@ -1357,7 +1537,7 @@ def _get_online_dataset_format(dataset_id): """ dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get") # build a dict from the xml and get the format from the dataset description - return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower() + return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower() # type: ignore def delete_dataset(dataset_id: int) -> bool: diff --git a/openml/evaluations/__init__.py b/openml/evaluations/__init__.py index 400a59652..dbff47037 100644 --- a/openml/evaluations/__init__.py +++ b/openml/evaluations/__init__.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause from .evaluation import OpenMLEvaluation -from .functions import list_evaluations, list_evaluation_measures, list_evaluations_setups +from .functions import list_evaluation_measures, list_evaluations, list_evaluations_setups __all__ = [ "OpenMLEvaluation", diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py index 8bdf741c2..3cf732f25 100644 --- a/openml/evaluations/evaluation.py +++ b/openml/evaluations/evaluation.py @@ -1,9 +1,14 @@ # License: BSD 3-Clause +from __future__ import annotations import openml.config +import openml.datasets +import openml.flows +import openml.runs +import openml.tasks -class OpenMLEvaluation(object): +class OpenMLEvaluation: """ Contains all meta-information about a run / evaluation combination, according to the evaluation/list function @@ -41,22 +46,22 @@ class OpenMLEvaluation(object): (e.g., in case of precision, auroc, recall) """ - def __init__( + def __init__( # noqa: PLR0913 self, - run_id, - task_id, - setup_id, - flow_id, - flow_name, - data_id, - data_name, - function, - upload_time, + 
run_id: int, + task_id: int, + setup_id: int, + flow_id: int, + flow_name: str, + data_id: int, + data_name: str, + function: str, + upload_time: str, uploader: int, uploader_name: str, - value, - values, - array_data=None, + value: float | None, + values: list[float] | None, + array_data: str | None = None, ): self.run_id = run_id self.task_id = task_id @@ -73,7 +78,7 @@ def __init__( self.values = values self.array_data = array_data - def __repr__(self): + def __repr__(self) -> str: header = "OpenML Evaluation" header = "{}\n{}\n".format(header, "=" * len(header)) @@ -107,9 +112,9 @@ def __repr__(self): "Metric Used", "Result", ] - fields = [(key, fields[key]) for key in order if key in fields] + _fields = [(key, fields[key]) for key in order if key in fields] - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = "\n".join(field_line_format.format(name, value) for name, value in fields) + longest_field_name_length = max(len(name) for name, _ in _fields) + field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" + body = "\n".join(field_line_format.format(name, value) for name, value in _fields) return header + body diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 214348345..a854686d1 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -1,35 +1,75 @@ # License: BSD 3-Clause +# ruff: noqa: PLR0913 +from __future__ import annotations import json import warnings +from typing import Any +from typing_extensions import Literal, overload -import xmltodict -import pandas as pd import numpy as np -from typing import Union, List, Optional, Dict -import collections +import pandas as pd +import xmltodict -import openml.utils -import openml._api_calls -from ..evaluations import OpenMLEvaluation import openml +import openml._api_calls +import openml.utils +from openml.evaluations import OpenMLEvaluation 
+ + +@overload +def list_evaluations( + function: str, + offset: int | None = ..., + size: int | None = ..., + tasks: list[str | int] | None = ..., + setups: list[str | int] | None = ..., + flows: list[str | int] | None = ..., + runs: list[str | int] | None = ..., + uploaders: list[str | int] | None = ..., + tag: str | None = ..., + study: int | None = ..., + per_fold: bool | None = ..., + sort_order: str | None = ..., + output_format: Literal["dict", "object"] = "dict", +) -> dict: + ... + + +@overload +def list_evaluations( + function: str, + offset: int | None = ..., + size: int | None = ..., + tasks: list[str | int] | None = ..., + setups: list[str | int] | None = ..., + flows: list[str | int] | None = ..., + runs: list[str | int] | None = ..., + uploaders: list[str | int] | None = ..., + tag: str | None = ..., + study: int | None = ..., + per_fold: bool | None = ..., + sort_order: str | None = ..., + output_format: Literal["dataframe"] = ..., +) -> pd.DataFrame: + ... def list_evaluations( function: str, - offset: Optional[int] = None, - size: Optional[int] = 10000, - tasks: Optional[List[Union[str, int]]] = None, - setups: Optional[List[Union[str, int]]] = None, - flows: Optional[List[Union[str, int]]] = None, - runs: Optional[List[Union[str, int]]] = None, - uploaders: Optional[List[Union[str, int]]] = None, - tag: Optional[str] = None, - study: Optional[int] = None, - per_fold: Optional[bool] = None, - sort_order: Optional[str] = None, - output_format: str = "object", -) -> Union[Dict, pd.DataFrame]: + offset: int | None = None, + size: int | None = 10000, + tasks: list[str | int] | None = None, + setups: list[str | int] | None = None, + flows: list[str | int] | None = None, + runs: list[str | int] | None = None, + uploaders: list[str | int] | None = None, + tag: str | None = None, + study: int | None = None, + per_fold: bool | None = None, + sort_order: str | None = None, + output_format: Literal["object", "dict", "dataframe"] = "object", +) -> dict | 
pd.DataFrame: """ List all run-evaluation pairs matching all of the given filters. (Supports large amount of results) @@ -76,7 +116,7 @@ def list_evaluations( """ if output_format not in ["dataframe", "dict", "object"]: raise ValueError( - "Invalid output format selected. " "Only 'object', 'dataframe', or 'dict' applicable." + "Invalid output format selected. Only 'object', 'dataframe', or 'dict' applicable.", ) # TODO: [0.15] @@ -92,8 +132,8 @@ def list_evaluations( if per_fold is not None: per_fold_str = str(per_fold).lower() - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_evaluations, function=function, offset=offset, @@ -112,16 +152,16 @@ def list_evaluations( def _list_evaluations( function: str, - tasks: Optional[List] = None, - setups: Optional[List] = None, - flows: Optional[List] = None, - runs: Optional[List] = None, - uploaders: Optional[List] = None, - study: Optional[int] = None, - sort_order: Optional[str] = None, - output_format: str = "object", - **kwargs -) -> Union[Dict, pd.DataFrame]: + tasks: list | None = None, + setups: list | None = None, + flows: list | None = None, + runs: list | None = None, + uploaders: list | None = None, + study: int | None = None, + sort_order: str | None = None, + output_format: Literal["object", "dict", "dataframe"] = "object", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Perform API call ``/evaluation/function{function}/{filters}`` @@ -164,11 +204,10 @@ def _list_evaluations( ------- dict of objects, or dataframe """ - api_call = "evaluation/list/function/%s" % function if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" if tasks is not None: api_call += "/task/%s" % ",".join([str(int(i)) for i in tasks]) if setups is not None: @@ -187,23 +226,26 @@ def _list_evaluations( return 
__list_evaluations(api_call, output_format=output_format) -def __list_evaluations(api_call, output_format="object"): +def __list_evaluations( + api_call: str, + output_format: Literal["object", "dict", "dataframe"] = "object", +) -> dict | pd.DataFrame: """Helper function to parse API calls which are lists of runs""" xml_string = openml._api_calls._perform_api_call(api_call, "get") evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",)) # Minimalistic check if the XML is useful if "oml:evaluations" not in evals_dict: raise ValueError( - "Error in return XML, does not contain " '"oml:evaluations": %s' % str(evals_dict) + "Error in return XML, does not contain " '"oml:evaluations": %s' % str(evals_dict), ) - assert type(evals_dict["oml:evaluations"]["oml:evaluation"]) == list, type( - evals_dict["oml:evaluations"] + assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), type( + evals_dict["oml:evaluations"], ) - evals = collections.OrderedDict() + evals: dict[int, dict | OpenMLEvaluation] = {} uploader_ids = list( - set([eval_["oml:uploader"] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]]) + {eval_["oml:uploader"] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]}, ) api_users = "user/list/user_id/" + ",".join(uploader_ids) xml_string_user = openml._api_calls._perform_api_call(api_users, "get") @@ -211,32 +253,33 @@ def __list_evaluations(api_call, output_format="object"): user_dict = {user["oml:id"]: user["oml:username"] for user in users["oml:users"]["oml:user"]} for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]: run_id = int(eval_["oml:run_id"]) + value = None - values = None - array_data = None if "oml:value" in eval_: value = float(eval_["oml:value"]) + + values = None if "oml:values" in eval_: values = json.loads(eval_["oml:values"]) - if "oml:array_data" in eval_: - array_data = eval_["oml:array_data"] + + array_data = eval_.get("oml:array_data") if output_format == "object": 
evals[run_id] = OpenMLEvaluation( - int(eval_["oml:run_id"]), - int(eval_["oml:task_id"]), - int(eval_["oml:setup_id"]), - int(eval_["oml:flow_id"]), - eval_["oml:flow_name"], - int(eval_["oml:data_id"]), - eval_["oml:data_name"], - eval_["oml:function"], - eval_["oml:upload_time"], - int(eval_["oml:uploader"]), - user_dict[eval_["oml:uploader"]], - value, - values, - array_data, + run_id=run_id, + task_id=int(eval_["oml:task_id"]), + setup_id=int(eval_["oml:setup_id"]), + flow_id=int(eval_["oml:flow_id"]), + flow_name=eval_["oml:flow_name"], + data_id=int(eval_["oml:data_id"]), + data_name=eval_["oml:data_name"], + function=eval_["oml:function"], + upload_time=eval_["oml:upload_time"], + uploader=int(eval_["oml:uploader"]), + uploader_name=user_dict[eval_["oml:uploader"]], + value=value, + values=values, + array_data=array_data, ) else: # for output_format in ['dict', 'dataframe'] @@ -258,12 +301,13 @@ def __list_evaluations(api_call, output_format="object"): } if output_format == "dataframe": - rows = [value for key, value in evals.items()] - evals = pd.DataFrame.from_records(rows, columns=rows[0].keys()) + rows = list(evals.values()) + return pd.DataFrame.from_records(rows, columns=rows[0].keys()) # type: ignore + return evals -def list_evaluation_measures() -> List[str]: +def list_evaluation_measures() -> list[str]: """Return list of evaluation measures available. 
The function performs an API call to retrieve the entire list of @@ -282,11 +326,10 @@ def list_evaluation_measures() -> List[str]: raise ValueError("Error in return XML, does not contain " '"oml:evaluation_measures"') if not isinstance(qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"], list): raise TypeError("Error in return XML, does not contain " '"oml:measure" as a list') - qualities = qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"] - return qualities + return qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"] -def list_estimation_procedures() -> List[str]: +def list_estimation_procedures() -> list[str]: """Return list of evaluation procedures available. The function performs an API call to retrieve the entire list of @@ -296,7 +339,6 @@ def list_estimation_procedures() -> List[str]: ------- list """ - api_call = "estimationprocedure/list" xml_string = openml._api_calls._perform_api_call(api_call, "get") api_results = xmltodict.parse(xml_string) @@ -309,31 +351,30 @@ def list_estimation_procedures() -> List[str]: if not isinstance(api_results["oml:estimationprocedures"]["oml:estimationprocedure"], list): raise TypeError( - "Error in return XML, does not contain " '"oml:estimationprocedure" as a list' + "Error in return XML, does not contain " '"oml:estimationprocedure" as a list', ) - prods = [ + return [ prod["oml:name"] for prod in api_results["oml:estimationprocedures"]["oml:estimationprocedure"] ] - return prods def list_evaluations_setups( function: str, - offset: Optional[int] = None, - size: Optional[int] = None, - tasks: Optional[List] = None, - setups: Optional[List] = None, - flows: Optional[List] = None, - runs: Optional[List] = None, - uploaders: Optional[List] = None, - tag: Optional[str] = None, - per_fold: Optional[bool] = None, - sort_order: Optional[str] = None, + offset: int | None = None, + size: int | None = None, + tasks: list | None = None, + setups: list | None = None, + 
flows: list | None = None, + runs: list | None = None, + uploaders: list | None = None, + tag: str | None = None, + per_fold: bool | None = None, + sort_order: str | None = None, output_format: str = "dataframe", - parameters_in_separate_columns: bool = False, -) -> Union[Dict, pd.DataFrame]: + parameters_in_separate_columns: bool = False, # noqa: FBT001, FBT002 +) -> dict | pd.DataFrame: """ List all run-evaluation pairs matching all of the given filters and their hyperparameter settings. @@ -376,7 +417,7 @@ def list_evaluations_setups( """ if parameters_in_separate_columns and (flows is None or len(flows) != 1): raise ValueError( - "Can set parameters_in_separate_columns to true " "only for single flow_id" + "Can set parameters_in_separate_columns to true " "only for single flow_id", ) # List evaluations @@ -397,40 +438,42 @@ def list_evaluations_setups( # List setups # list_setups by setup id does not support large sizes (exceeds URL length limit) # Hence we split the list of unique setup ids returned by list_evaluations into chunks of size N - df = pd.DataFrame() + _df = pd.DataFrame() if len(evals) != 0: N = 100 # size of section length = len(evals["setup_id"].unique()) # length of the array we want to split # array_split - allows indices_or_sections to not equally divide the array # array_split -length % N sub-arrays of size length//N + 1 and the rest of size length//N. 
- setup_chunks = np.array_split( - ary=evals["setup_id"].unique(), indices_or_sections=((length - 1) // N) + 1 - ) + uniq = np.asarray(evals["setup_id"].unique()) + setup_chunks = np.array_split(uniq, ((length - 1) // N) + 1) setup_data = pd.DataFrame() - for setups in setup_chunks: - result = pd.DataFrame( - openml.setups.list_setups(setup=setups, output_format="dataframe") - ) - result.drop("flow_id", axis=1, inplace=True) + for _setups in setup_chunks: + result = openml.setups.list_setups(setup=_setups, output_format="dataframe") + assert isinstance(result, pd.DataFrame) + result = result.drop("flow_id", axis=1) # concat resulting setup chunks into single datframe setup_data = pd.concat([setup_data, result], ignore_index=True) + parameters = [] # Convert parameters of setup into list of tuples of (hyperparameter, value) for parameter_dict in setup_data["parameters"]: if parameter_dict is not None: parameters.append( - {param["full_name"]: param["value"] for param in parameter_dict.values()} + {param["full_name"]: param["value"] for param in parameter_dict.values()}, ) else: parameters.append({}) setup_data["parameters"] = parameters # Merge setups with evaluations - df = pd.merge(evals, setup_data, on="setup_id", how="left") + _df = evals.merge(setup_data, on="setup_id", how="left") if parameters_in_separate_columns: - df = pd.concat([df.drop("parameters", axis=1), df["parameters"].apply(pd.Series)], axis=1) + _df = pd.concat( + [_df.drop("parameters", axis=1), _df["parameters"].apply(pd.Series)], + axis=1, + ) if output_format == "dataframe": - return df - else: - return df.to_dict(orient="index") + return _df + + return _df.to_dict(orient="index") diff --git a/openml/exceptions.py b/openml/exceptions.py index a86434f51..fe63b8a58 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -1,9 +1,10 @@ # License: BSD 3-Clause - -from typing import Optional +from __future__ import annotations class PyOpenMLError(Exception): + """Base class for all 
exceptions in OpenML-Python.""" + def __init__(self, message: str): self.message = message super().__init__(message) @@ -11,55 +12,47 @@ def __init__(self, message: str): class OpenMLServerError(PyOpenMLError): """class for when something is really wrong on the server - (result did not parse to dict), contains unparsed error.""" + (result did not parse to dict), contains unparsed error. + """ - pass - -class OpenMLServerException(OpenMLServerError): +class OpenMLServerException(OpenMLServerError): # noqa: N818 """exception for when the result of the server was - not 200 (e.g., listing call w/o results).""" + not 200 (e.g., listing call w/o results). + """ # Code needs to be optional to allow the exception to be picklable: # https://stackoverflow.com/questions/16244923/how-to-make-a-custom-exception-class-with-multiple-init-args-pickleable # noqa: E501 - def __init__(self, message: str, code: Optional[int] = None, url: Optional[str] = None): + def __init__(self, message: str, code: int | None = None, url: str | None = None): self.message = message self.code = code self.url = url super().__init__(message) - def __str__(self): + def __str__(self) -> str: return f"{self.url} returned code {self.code}: {self.message}" class OpenMLServerNoResult(OpenMLServerException): """Exception for when the result of the server is empty.""" - pass - -class OpenMLCacheException(PyOpenMLError): +class OpenMLCacheException(PyOpenMLError): # noqa: N818 """Dataset / task etc not found in cache""" - pass - -class OpenMLHashException(PyOpenMLError): +class OpenMLHashException(PyOpenMLError): # noqa: N818 """Locally computed hash is different than hash announced by the server.""" - pass - class OpenMLPrivateDatasetError(PyOpenMLError): """Exception thrown when the user has no rights to access the dataset.""" - pass - class OpenMLRunsExistError(PyOpenMLError): """Indicates run(s) already exists on the server when they should not be duplicated.""" - def __init__(self, run_ids: set, message: 
str): + def __init__(self, run_ids: set[int], message: str) -> None: if len(run_ids) < 1: raise ValueError("Set of run ids must be non-empty.") self.run_ids = run_ids @@ -69,4 +62,6 @@ def __init__(self, run_ids: set, message: str): class OpenMLNotAuthorizedError(OpenMLServerError): """Indicates an authenticated user is not authorized to execute the requested action.""" - pass + +class ObjectNotPublishedError(PyOpenMLError): + """Indicates an object has not been published yet.""" diff --git a/openml/extensions/__init__.py b/openml/extensions/__init__.py index 91cbc1600..b49865e0e 100644 --- a/openml/extensions/__init__.py +++ b/openml/extensions/__init__.py @@ -3,8 +3,7 @@ from typing import List, Type # noqa: F401 from .extension_interface import Extension -from .functions import register_extension, get_extension_by_model, get_extension_by_flow - +from .functions import get_extension_by_flow, get_extension_by_model, register_extension extensions = [] # type: List[Type[Extension]] diff --git a/openml/extensions/extension_interface.py b/openml/extensions/extension_interface.py index 981bf2417..2a336eb52 100644 --- a/openml/extensions/extension_interface.py +++ b/openml/extensions/extension_interface.py @@ -1,21 +1,21 @@ # License: BSD 3-Clause +from __future__ import annotations from abc import ABC, abstractmethod -from collections import OrderedDict # noqa: F401 -from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING, Union - -import numpy as np -import scipy.sparse +from collections import OrderedDict +from typing import TYPE_CHECKING, Any # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: + import numpy as np + import scipy.sparse + from openml.flows import OpenMLFlow + from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration # F401 from openml.tasks.task import OpenMLTask - from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration # noqa F401 class Extension(ABC): - 
"""Defines the interface to connect machine learning libraries to OpenML-Python. See ``openml.extension.sklearn.extension`` for an implementation to bootstrap from. @@ -26,7 +26,7 @@ class Extension(ABC): @classmethod @abstractmethod - def can_handle_flow(cls, flow: "OpenMLFlow") -> bool: + def can_handle_flow(cls, flow: OpenMLFlow) -> bool: """Check whether a given flow can be handled by this extension. This is typically done by parsing the ``external_version`` field. @@ -62,9 +62,9 @@ def can_handle_model(cls, model: Any) -> bool: @abstractmethod def flow_to_model( self, - flow: "OpenMLFlow", - initialize_with_defaults: bool = False, - strict_version: bool = True, + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + strict_version: bool = True, # noqa: FBT002, FBT001 ) -> Any: """Instantiate a model from the flow representation. @@ -85,7 +85,7 @@ def flow_to_model( """ @abstractmethod - def model_to_flow(self, model: Any) -> "OpenMLFlow": + def model_to_flow(self, model: Any) -> OpenMLFlow: """Transform a model to a flow for uploading it to OpenML. Parameters @@ -98,7 +98,7 @@ def model_to_flow(self, model: Any) -> "OpenMLFlow": """ @abstractmethod - def get_version_information(self) -> List[str]: + def get_version_information(self) -> list[str]: """List versions of libraries required by the flow. Returns @@ -139,7 +139,7 @@ def is_estimator(self, model: Any) -> bool: """ @abstractmethod - def seed_model(self, model: Any, seed: Optional[int]) -> Any: + def seed_model(self, model: Any, seed: int | None) -> Any: """Set the seed of all the unseeded components of a model and return the seeded model. Required so that all seed information can be uploaded to OpenML for reproducible results. 
@@ -156,16 +156,16 @@ def seed_model(self, model: Any, seed: Optional[int]) -> Any: """ @abstractmethod - def _run_model_on_fold( + def _run_model_on_fold( # noqa: PLR0913 self, model: Any, - task: "OpenMLTask", - X_train: Union[np.ndarray, scipy.sparse.spmatrix], + task: OpenMLTask, + X_train: np.ndarray | scipy.sparse.spmatrix, rep_no: int, fold_no: int, - y_train: Optional[np.ndarray] = None, - X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix]] = None, - ) -> Tuple[np.ndarray, np.ndarray, "OrderedDict[str, float]", Optional["OpenMLRunTrace"]]: + y_train: np.ndarray | None = None, + X_test: np.ndarray | scipy.sparse.spmatrix | None = None, + ) -> tuple[np.ndarray, np.ndarray | None, OrderedDict[str, float], OpenMLRunTrace | None]: """Run a model on a repeat, fold, subsample triplet of the task. Returns the data that is necessary to construct the OpenML Run object. Is used by @@ -205,9 +205,9 @@ def _run_model_on_fold( @abstractmethod def obtain_parameter_values( self, - flow: "OpenMLFlow", + flow: OpenMLFlow, model: Any = None, - ) -> List[Dict[str, Any]]: + ) -> list[dict[str, Any]]: """Extracts all parameter settings required for the flow from the model. If no explicit model is provided, the parameters will be extracted from `flow.model` @@ -251,7 +251,7 @@ def check_if_model_fitted(self, model: Any) -> bool: def instantiate_model_from_hpo_class( self, model: Any, - trace_iteration: "OpenMLTraceIteration", + trace_iteration: OpenMLTraceIteration, ) -> Any: """Instantiate a base model which can be searched over by the hyperparameter optimization model. diff --git a/openml/extensions/functions.py b/openml/extensions/functions.py index a080e1004..302ab246c 100644 --- a/openml/extensions/functions.py +++ b/openml/extensions/functions.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause +from __future__ import annotations -from typing import Any, Optional, Type, TYPE_CHECKING -from . 
import Extension +from typing import TYPE_CHECKING, Any # Need to implement the following by its full path because otherwise it won't be possible to # access openml.extensions.extensions @@ -11,8 +11,10 @@ if TYPE_CHECKING: from openml.flows import OpenMLFlow + from . import Extension -def register_extension(extension: Type[Extension]) -> None: + +def register_extension(extension: type[Extension]) -> None: """Register an extension. Registered extensions are considered by ``get_extension_by_flow`` and @@ -30,9 +32,9 @@ def register_extension(extension: Type[Extension]) -> None: def get_extension_by_flow( - flow: "OpenMLFlow", - raise_if_no_extension: bool = False, -) -> Optional[Extension]: + flow: OpenMLFlow, + raise_if_no_extension: bool = False, # noqa: FBT001, FBT002 +) -> Extension | None: """Get an extension which can handle the given flow. Iterates all registered extensions and checks whether they can handle the presented flow. @@ -55,22 +57,23 @@ def get_extension_by_flow( candidates.append(extension_class()) if len(candidates) == 0: if raise_if_no_extension: - raise ValueError("No extension registered which can handle flow: {}".format(flow)) - else: - return None - elif len(candidates) == 1: + raise ValueError(f"No extension registered which can handle flow: {flow}") + + return None + + if len(candidates) == 1: return candidates[0] - else: - raise ValueError( - "Multiple extensions registered which can handle flow: {}, but only one " - "is allowed ({}).".format(flow, candidates) - ) + + raise ValueError( + f"Multiple extensions registered which can handle flow: {flow}, but only one " + f"is allowed ({candidates}).", + ) def get_extension_by_model( model: Any, - raise_if_no_extension: bool = False, -) -> Optional[Extension]: + raise_if_no_extension: bool = False, # noqa: FBT001, FBT002 +) -> Extension | None: """Get an extension which can handle the given flow. Iterates all registered extensions and checks whether they can handle the presented model. 
@@ -93,13 +96,14 @@ def get_extension_by_model( candidates.append(extension_class()) if len(candidates) == 0: if raise_if_no_extension: - raise ValueError("No extension registered which can handle model: {}".format(model)) - else: - return None - elif len(candidates) == 1: + raise ValueError(f"No extension registered which can handle model: {model}") + + return None + + if len(candidates) == 1: return candidates[0] - else: - raise ValueError( - "Multiple extensions registered which can handle model: {}, but only one " - "is allowed ({}).".format(model, candidates) - ) + + raise ValueError( + f"Multiple extensions registered which can handle model: {model}, but only one " + f"is allowed ({candidates}).", + ) diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py index 135e5ccf6..9c1c6cba6 100644 --- a/openml/extensions/sklearn/__init__.py +++ b/openml/extensions/sklearn/__init__.py @@ -1,15 +1,21 @@ # License: BSD 3-Clause +from __future__ import annotations + +from typing import TYPE_CHECKING -from .extension import SklearnExtension from openml.extensions import register_extension +from .extension import SklearnExtension + +if TYPE_CHECKING: + import pandas as pd __all__ = ["SklearnExtension"] register_extension(SklearnExtension) -def cont(X): +def cont(X: pd.DataFrame) -> pd.Series: """Returns True for all non-categorical columns, False for the rest. This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling @@ -23,7 +29,7 @@ def cont(X): return X.dtypes != "category" -def cat(X): +def cat(X: pd.DataFrame) -> pd.Series: """Returns True for all categorical columns, False for the rest. 
This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 82d202e9c..3427ca7c9 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -1,23 +1,27 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import OrderedDict # noqa: F401 +import contextlib import copy -from distutils.version import LooseVersion import importlib import inspect import json import logging import re -from re import IGNORECASE import sys import time -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, cast, Sized +import traceback import warnings +from collections import OrderedDict +from distutils.version import LooseVersion +from json.decoder import JSONDecodeError +from re import IGNORECASE +from typing import Any, Callable, List, Sized, cast import numpy as np import pandas as pd -import scipy.stats import scipy.sparse +import scipy.stats import sklearn.base import sklearn.model_selection import sklearn.pipeline @@ -26,26 +30,22 @@ from openml.exceptions import PyOpenMLError from openml.extensions import Extension from openml.flows import OpenMLFlow -from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration, PREFIX +from openml.runs.trace import PREFIX, OpenMLRunTrace, OpenMLTraceIteration from openml.tasks import ( - OpenMLTask, - OpenMLSupervisedTask, OpenMLClassificationTask, - OpenMLLearningCurveTask, OpenMLClusteringTask, + OpenMLLearningCurveTask, OpenMLRegressionTask, + OpenMLSupervisedTask, + OpenMLTask, ) logger = logging.getLogger(__name__) -if sys.version_info >= (3, 5): - from json.decoder import JSONDecodeError -else: - JSONDecodeError = ValueError DEPENDENCIES_PATTERN = re.compile( r"^(?P[\w\-]+)((?P==|>=|>)" - r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$" + r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$", ) SIMPLE_NUMPY_TYPES = [ @@ -54,7 +54,7 
@@ for nptype in nptypes # type: ignore if type_cat != "others" ] -SIMPLE_TYPES = tuple([bool, int, float, str] + SIMPLE_NUMPY_TYPES) +SIMPLE_TYPES = (bool, int, float, str, *SIMPLE_NUMPY_TYPES) SKLEARN_PIPELINE_STRING_COMPONENTS = ("drop", "passthrough") COMPONENT_REFERENCE = "component_reference" @@ -71,7 +71,7 @@ class SklearnExtension(Extension): # General setup @classmethod - def can_handle_flow(cls, flow: "OpenMLFlow") -> bool: + def can_handle_flow(cls, flow: OpenMLFlow) -> bool: """Check whether a given describes a scikit-learn estimator. This is done by parsing the ``external_version`` field. @@ -101,8 +101,11 @@ def can_handle_model(cls, model: Any) -> bool: return isinstance(model, sklearn.base.BaseEstimator) @classmethod - def trim_flow_name( - cls, long_name: str, extra_trim_length: int = 100, _outer: bool = True + def trim_flow_name( # noqa: C901 + cls, + long_name: str, + extra_trim_length: int = 100, + _outer: bool = True, # noqa: FBT001, FBT002 ) -> str: """Shorten generated sklearn flow name to at most ``max_length`` characters. 
@@ -157,7 +160,7 @@ def remove_all_in_parentheses(string: str) -> str: # the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and # keep it in the final trimmed flow name: # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer, - # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: ERA001, E501 # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator= # sklearn.tree.tree.DecisionTreeClassifier)) if "sklearn.model_selection" in long_name: @@ -173,7 +176,7 @@ def remove_all_in_parentheses(string: str) -> str: # Now we want to also find and parse the `estimator`, for this we find the closing # parenthesis to the model selection technique: closing_parenthesis_expected = 1 - for i, char in enumerate(long_name[estimator_start:], start=estimator_start): + for char in long_name[estimator_start:]: if char == "(": closing_parenthesis_expected += 1 if char == ")": @@ -181,11 +184,13 @@ def remove_all_in_parentheses(string: str) -> str: if closing_parenthesis_expected == 0: break - model_select_pipeline = long_name[estimator_start:i] + _end: int = estimator_start + len(long_name[estimator_start:]) - 1 + model_select_pipeline = long_name[estimator_start:_end] + trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False) _, trimmed_pipeline = trimmed_pipeline.split(".", maxsplit=1) # trim module prefix - model_select_short = "sklearn.{}[{}]".format(model_selection_class, trimmed_pipeline) - name = long_name[:start_index] + model_select_short + long_name[i + 1 :] + model_select_short = f"sklearn.{model_selection_class}[{trimmed_pipeline}]" + name = long_name[:start_index] + model_select_short + long_name[_end + 1 :] else: name = long_name @@ -204,7 +209,7 @@ def remove_all_in_parentheses(string: str) -> str: components = [component.split(".")[-1] for component in 
pipeline.split(",")] pipeline = "{}({})".format(pipeline_class, ",".join(components)) if len(short_name.format(pipeline)) > extra_trim_length: - pipeline = "{}(...,{})".format(pipeline_class, components[-1]) + pipeline = f"{pipeline_class}(...,{components[-1]})" else: # Just a simple component: e.g. sklearn.tree.DecisionTreeClassifier pipeline = remove_all_in_parentheses(name).split(".")[-1] @@ -242,10 +247,10 @@ def _min_dependency_str(cls, sklearn_version: str) -> str: from sklearn import _min_dependencies as _mindep dependency_list = { - "numpy": "{}".format(_mindep.NUMPY_MIN_VERSION), - "scipy": "{}".format(_mindep.SCIPY_MIN_VERSION), - "joblib": "{}".format(_mindep.JOBLIB_MIN_VERSION), - "threadpoolctl": "{}".format(_mindep.THREADPOOLCTL_MIN_VERSION), + "numpy": f"{_mindep.NUMPY_MIN_VERSION}", + "scipy": f"{_mindep.SCIPY_MIN_VERSION}", + "joblib": f"{_mindep.JOBLIB_MIN_VERSION}", + "threadpoolctl": f"{_mindep.THREADPOOLCTL_MIN_VERSION}", } elif LooseVersion(sklearn_version) >= "0.23": dependency_list = { @@ -269,8 +274,8 @@ def _min_dependency_str(cls, sklearn_version: str) -> str: # the dependency list will be accurately updated for any flow uploaded to OpenML dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} - sklearn_dep = "sklearn=={}".format(sklearn_version) - dep_str = "\n".join(["{}>={}".format(k, v) for k, v in dependency_list.items()]) + sklearn_dep = f"sklearn=={sklearn_version}" + dep_str = "\n".join([f"{k}>={v}" for k, v in dependency_list.items()]) return "\n".join([sklearn_dep, dep_str]) ################################################################################################ @@ -278,9 +283,9 @@ def _min_dependency_str(cls, sklearn_version: str) -> str: def flow_to_model( self, - flow: "OpenMLFlow", - initialize_with_defaults: bool = False, - strict_version: bool = True, + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + strict_version: bool = True, # noqa: FBT001, FBT002 ) -> Any: """Initializes a 
sklearn model based on a flow. @@ -302,16 +307,18 @@ def flow_to_model( mixed """ return self._deserialize_sklearn( - flow, initialize_with_defaults=initialize_with_defaults, strict_version=strict_version + flow, + initialize_with_defaults=initialize_with_defaults, + strict_version=strict_version, ) - def _deserialize_sklearn( + def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0913, PLR0912 self, o: Any, - components: Optional[Dict] = None, - initialize_with_defaults: bool = False, + components: dict | None = None, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 recursion_depth: int = 0, - strict_version: bool = True, + strict_version: bool = True, # noqa: FBT002, FBT001 ) -> Any: """Recursive function to deserialize a scikit-learn flow. @@ -346,10 +353,10 @@ def _deserialize_sklearn( ------- mixed """ - logger.info( - "-%s flow_to_sklearn START o=%s, components=%s, init_defaults=%s" - % ("-" * recursion_depth, o, components, initialize_with_defaults) + "-{} flow_to_sklearn START o={}, components={}, init_defaults={}".format( + "-" * recursion_depth, o, components, initialize_with_defaults + ), ) depth_pp = recursion_depth + 1 # shortcut var, depth plus plus @@ -359,10 +366,8 @@ def _deserialize_sklearn( # the parameter values to the correct type. 
if isinstance(o, str): - try: + with contextlib.suppress(JSONDecodeError): o = json.loads(o) - except JSONDecodeError: - pass if isinstance(o, dict): # Check if the dict encodes a 'special' object, which could not @@ -382,7 +387,9 @@ def _deserialize_sklearn( pass elif serialized_type == COMPONENT_REFERENCE: value = self._deserialize_sklearn( - value, recursion_depth=depth_pp, strict_version=strict_version + value, + recursion_depth=depth_pp, + strict_version=strict_version, ) else: raise NotImplementedError(serialized_type) @@ -407,7 +414,9 @@ def _deserialize_sklearn( rval = (step_name, component, value["argument_1"]) elif serialized_type == "cv_object": rval = self._deserialize_cross_validator( - value, recursion_depth=recursion_depth, strict_version=strict_version + value, + recursion_depth=recursion_depth, + strict_version=strict_version, ) else: raise ValueError("Cannot flow_to_sklearn %s" % serialized_type) @@ -458,10 +467,12 @@ def _deserialize_sklearn( ) else: raise TypeError(o) - logger.info("-%s flow_to_sklearn END o=%s, rval=%s" % ("-" * recursion_depth, o, rval)) + logger.info( + "-{} flow_to_sklearn END o={}, rval={}".format("-" * recursion_depth, o, rval) + ) return rval - def model_to_flow(self, model: Any) -> "OpenMLFlow": + def model_to_flow(self, model: Any) -> OpenMLFlow: """Transform a scikit-learn model to a flow for uploading it to OpenML. 
Parameters @@ -475,7 +486,7 @@ def model_to_flow(self, model: Any) -> "OpenMLFlow": # Necessary to make pypy not complain about all the different possible return types return self._serialize_sklearn(model) - def _serialize_sklearn(self, o: Any, parent_model: Optional[Any] = None) -> Any: + def _serialize_sklearn(self, o: Any, parent_model: Any | None = None) -> Any: # noqa: PLR0912, C901 rval = None # type: Any # TODO: assert that only on first recursion lvl `parent_model` can be None @@ -502,19 +513,17 @@ def _serialize_sklearn(self, o: Any, parent_model: Optional[Any] = None) -> Any: elif isinstance(o, dict): # TODO: explain what type of parameter is here if not isinstance(o, OrderedDict): - o = OrderedDict([(key, value) for key, value in sorted(o.items())]) + o = OrderedDict(sorted(o.items())) rval = OrderedDict() for key, value in o.items(): if not isinstance(key, str): raise TypeError( "Can only use string as keys, you passed " - "type %s for value %s." % (type(key), str(key)) + f"type {type(key)} for value {key!s}.", ) - key = self._serialize_sklearn(key, parent_model) - value = self._serialize_sklearn(value, parent_model) - rval[key] = value - rval = rval + _key = self._serialize_sklearn(key, parent_model) + rval[_key] = self._serialize_sklearn(value, parent_model) elif isinstance(o, type): # TODO: explain what type of parameter is here rval = self._serialize_type(o) @@ -534,7 +543,7 @@ def _serialize_sklearn(self, o: Any, parent_model: Optional[Any] = None) -> Any: return rval - def get_version_information(self) -> List[str]: + def get_version_information(self) -> list[str]: """List versions of libraries required by the flow. Libraries listed are ``Python``, ``scikit-learn``, ``numpy`` and ``scipy``. @@ -543,22 +552,21 @@ def get_version_information(self) -> List[str]: ------- List """ - # This can possibly be done by a package such as pyxb, but I could not get # it to work properly. 
- import sklearn - import scipy import numpy + import scipy + import sklearn major, minor, micro, _, _ = sys.version_info python_version = "Python_{}.".format(".".join([str(major), str(minor), str(micro)])) - sklearn_version = "Sklearn_{}.".format(sklearn.__version__) - numpy_version = "NumPy_{}.".format(numpy.__version__) # type: ignore - scipy_version = "SciPy_{}.".format(scipy.__version__) + sklearn_version = f"Sklearn_{sklearn.__version__}." + numpy_version = f"NumPy_{numpy.__version__}." # type: ignore + scipy_version = f"SciPy_{scipy.__version__}." return [python_version, sklearn_version, numpy_version, scipy_version] - def create_setup_string(self, model: Any) -> str: + def create_setup_string(self, model: Any) -> str: # noqa: ARG002 """Create a string which can be used to reinstantiate the given model. Parameters @@ -569,8 +577,7 @@ def create_setup_string(self, model: Any) -> str: ------- str """ - run_environment = " ".join(self.get_version_information()) - return run_environment + return " ".join(self.get_version_information()) def _is_cross_validator(self, o: Any) -> bool: return isinstance(o, sklearn.model_selection.BaseCrossValidator) @@ -584,7 +591,7 @@ def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool: return sklearn_dependency or sklearn_as_external def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str: - """Fetches the sklearn function docstring for the flow description + r"""Fetches the sklearn function docstring for the flow description Retrieves the sklearn docstring available and does the following: * If length of docstring <= char_lim, then returns the complete docstring @@ -618,14 +625,13 @@ def match_format(s): s = s[:index] # trimming docstring to be within char_lim if len(s) > char_lim: - s = "{}...".format(s[: char_lim - 3]) + s = f"{s[: char_lim - 3]}..." return s.strip() except ValueError: logger.warning( "'Read more' not found in descriptions. " - "Trying to trim till 'Parameters' if available in docstring." 
+ "Trying to trim till 'Parameters' if available in docstring.", ) - pass try: # if 'Read more' doesn't exist, trim till 'Parameters' pattern = "Parameters" @@ -637,10 +643,10 @@ def match_format(s): s = s[:index] # trimming docstring to be within char_lim if len(s) > char_lim: - s = "{}...".format(s[: char_lim - 3]) + s = f"{s[: char_lim - 3]}..." return s.strip() - def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]: + def _extract_sklearn_parameter_docstring(self, model) -> None | str: """Extracts the part of sklearn docstring containing parameter information Fetches the entire docstring and trims just the Parameter section. @@ -678,7 +684,7 @@ def match_format(s): index2 = s.index(match_format(h)) break except ValueError: - logger.warning("{} not available in docstring".format(h)) + logger.warning(f"{h} not available in docstring") continue else: # in the case only 'Parameters' exist, trim till end of docstring @@ -686,7 +692,7 @@ def match_format(s): s = s[index1:index2] return s.strip() - def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]: + def _extract_sklearn_param_info(self, model, char_lim=1024) -> None | dict: """Parses parameter type and description from sklearn dosctring Parameters @@ -715,7 +721,7 @@ def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict] # collecting parameters and their descriptions description = [] # type: List - for i, s in enumerate(lines): + for s in lines: param = p.findall(s) if param != []: # a parameter definition is found by regex @@ -724,19 +730,18 @@ def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict] # till another parameter is found and a new placeholder is created placeholder = [""] # type: List[str] description.append(placeholder) - else: - if len(description) > 0: # description=[] means no parameters found yet - # appending strings to the placeholder created when parameter found - description[-1].append(s) + 
elif len(description) > 0: # description=[] means no parameters found yet + # appending strings to the placeholder created when parameter found + description[-1].append(s) for i in range(len(description)): # concatenating parameter description strings description[i] = "\n".join(description[i]).strip() # limiting all parameter descriptions to accepted OpenML string length if len(description[i]) > char_lim: - description[i] = "{}...".format(description[i][: char_lim - 3]) + description[i] = f"{description[i][: char_lim - 3]}..." # collecting parameters and their types - parameter_docs = OrderedDict() # type: Dict + parameter_docs = OrderedDict() matches = p.findall(docstring) for i, param in enumerate(matches): key, value = str(param).split(":") @@ -765,7 +770,6 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: OpenMLFlow """ - # Get all necessary information about the model objects itself ( parameters, @@ -786,25 +790,24 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: # will be part of the name (in brackets) sub_components_names = "" for key in subcomponents: - if isinstance(subcomponents[key], OpenMLFlow): - name = subcomponents[key].name + name_thing = subcomponents[key] + if isinstance(name_thing, OpenMLFlow): + name = name_thing.name elif ( - isinstance(subcomponents[key], str) + isinstance(name_thing, str) and subcomponents[key] in SKLEARN_PIPELINE_STRING_COMPONENTS ): - name = subcomponents[key] + name = name_thing else: raise TypeError(type(subcomponents[key])) + if key in subcomponents_explicit: sub_components_names += "," + key + "=" + name else: sub_components_names += "," + name - if sub_components_names: - # slice operation on string in order to get rid of leading comma - name = "%s(%s)" % (class_name, sub_components_names[1:]) - else: - name = class_name + # slice operation on string in order to get rid of leading comma + name = f"{class_name}({sub_components_names[1:]})" if sub_components_names else class_name short_name = 
SklearnExtension.trim_flow_name(name) # Get the external versions of all sub-components @@ -813,7 +816,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: tags = self._get_tags() sklearn_description = self._get_sklearn_description(model) - flow = OpenMLFlow( + return OpenMLFlow( name=name, class_name=class_name, custom_name=short_name, @@ -829,14 +832,11 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: dependencies=dependencies, ) - return flow - def _get_dependencies(self) -> str: - dependencies = self._min_dependency_str(sklearn.__version__) - return dependencies + return self._min_dependency_str(sklearn.__version__) # type: ignore - def _get_tags(self) -> List[str]: - sklearn_version = self._format_external_version("sklearn", sklearn.__version__) + def _get_tags(self) -> list[str]: + sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore sklearn_version_formatted = sklearn_version.replace("==", "_") return [ "openml-python", @@ -853,7 +853,7 @@ def _get_tags(self) -> List[str]: def _get_external_version_string( self, model: Any, - sub_components: Dict[str, OpenMLFlow], + sub_components: dict[str, OpenMLFlow], ) -> str: # Create external version string for a flow, given the model and the # already parsed dictionary of sub_components. 
Retrieves the external @@ -875,7 +875,7 @@ def _get_external_version_string( external_versions.add(external_version) openml_version = self._format_external_version("openml", openml.__version__) - sklearn_version = self._format_external_version("sklearn", sklearn.__version__) + sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore external_versions.add(openml_version) external_versions.add(sklearn_version) for visitee in sub_components.values(): @@ -883,16 +883,16 @@ def _get_external_version_string( continue for external_version in visitee.external_version.split(","): external_versions.add(external_version) - return ",".join(list(sorted(external_versions))) + return ",".join(sorted(external_versions)) def _check_multiple_occurence_of_component_in_flow( self, model: Any, - sub_components: Dict[str, OpenMLFlow], + sub_components: dict[str, OpenMLFlow], ) -> None: - to_visit_stack = [] # type: List[OpenMLFlow] + to_visit_stack: list[OpenMLFlow] = [] to_visit_stack.extend(sub_components.values()) - known_sub_components = set() # type: Set[str] + known_sub_components: set[str] = set() while len(to_visit_stack) > 0: visitee = to_visit_stack.pop() @@ -900,21 +900,21 @@ def _check_multiple_occurence_of_component_in_flow( known_sub_components.add(visitee) elif visitee.name in known_sub_components: raise ValueError( - "Found a second occurence of component %s when " - "trying to serialize %s." 
% (visitee.name, model) + f"Found a second occurence of component {visitee.name} when " + f"trying to serialize {model}.", ) else: known_sub_components.add(visitee.name) to_visit_stack.extend(visitee.components.values()) - def _extract_information_from_model( + def _extract_information_from_model( # noqa: PLR0915, C901, PLR0912 self, model: Any, - ) -> Tuple[ - "OrderedDict[str, Optional[str]]", - "OrderedDict[str, Optional[Dict]]", - "OrderedDict[str, OpenMLFlow]", - Set, + ) -> tuple[ + OrderedDict[str, str | None], + OrderedDict[str, dict | None], + OrderedDict[str, OpenMLFlow], + set, ]: # This function contains four "global" states and is quite long and # complicated. If it gets to complicated to ensure it's correctness, @@ -926,8 +926,8 @@ def _extract_information_from_model( sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow] # stores the keys of all subcomponents that should become sub_components_explicit = set() - parameters = OrderedDict() # type: OrderedDict[str, Optional[str]] - parameters_meta_info = OrderedDict() # type: OrderedDict[str, Optional[Dict]] + parameters: OrderedDict[str, str | None] = OrderedDict() + parameters_meta_info: OrderedDict[str, dict | None] = OrderedDict() parameters_docs = self._extract_sklearn_param_info(model) model_parameters = model.get_params(deep=False) @@ -951,18 +951,16 @@ def flatten_all(list_): isinstance(rval, (list, tuple)) and len(rval) > 0 and isinstance(rval[0], (list, tuple)) - and all([isinstance(rval_i, type(rval[0])) for rval_i in rval]) + and all(isinstance(rval_i, type(rval[0])) for rval_i in rval) ) # Check that all list elements are of simple types. 
nested_list_of_simple_types = ( is_non_empty_list_of_lists_with_same_type - and all([isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval)]) + and all(isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval)) and all( - [ - len(rv) in (2, 3) and rv[1] not in SKLEARN_PIPELINE_STRING_COMPONENTS - for rv in rval - ] + len(rv) in (2, 3) and rv[1] not in SKLEARN_PIPELINE_STRING_COMPONENTS + for rv in rval ) ) @@ -970,10 +968,10 @@ def flatten_all(list_): # If a list of lists is identified that include 'non-simple' types (e.g. objects), # we assume they are steps in a pipeline, feature union, or base classifiers in # a voting classifier. - parameter_value = list() # type: List + parameter_value = [] # type: List reserved_keywords = set(model.get_params(deep=False).keys()) - for i, sub_component_tuple in enumerate(rval): + for sub_component_tuple in rval: identifier = sub_component_tuple[0] sub_component = sub_component_tuple[1] sub_component_type = type(sub_component_tuple) @@ -982,7 +980,7 @@ def flatten_all(list_): # Pipeline.steps, FeatureUnion.transformer_list} # length 3 is for ColumnTransformer msg = "Length of tuple of type {} does not match assumptions".format( - sub_component_type + sub_component_type, ) raise ValueError(msg) @@ -994,9 +992,7 @@ def flatten_all(list_): "got %s" % sub_component ) raise ValueError(msg) - else: - pass - elif isinstance(sub_component, type(None)): + elif sub_component is None: msg = ( "Cannot serialize objects of None type. Please use a valid " "placeholder for None. 
Note that empty sklearn estimators can be " @@ -1011,8 +1007,8 @@ def flatten_all(list_): raise TypeError(msg) if identifier in reserved_keywords: - parent_model = "{}.{}".format(model.__module__, model.__class__.__name__) - msg = "Found element shadowing official " "parameter for %s: %s" % ( + parent_model = f"{model.__module__}.{model.__class__.__name__}" + msg = "Found element shadowing official " "parameter for {}: {}".format( parent_model, identifier, ) @@ -1038,11 +1034,11 @@ def flatten_all(list_): dependencies=dependencies, model=None, ) - component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]] + component_reference: OrderedDict[str, str | dict] = OrderedDict() component_reference[ "oml-python:serialized_object" ] = COMPOSITION_STEP_CONSTANT - cr_value = OrderedDict() # type: Dict[str, Any] + cr_value: dict[str, Any] = OrderedDict() cr_value["key"] = identifier cr_value["step_name"] = identifier if len(sub_component_tuple) == 3: @@ -1084,27 +1080,27 @@ def flatten_all(list_): cr = self._serialize_sklearn(component_reference, model) parameters[k] = json.dumps(cr) + elif not (hasattr(rval, "__len__") and len(rval) == 0): + rval = json.dumps(rval) + parameters[k] = rval + # a regular hyperparameter else: - # a regular hyperparameter - if not (hasattr(rval, "__len__") and len(rval) == 0): - rval = json.dumps(rval) - parameters[k] = rval - else: - parameters[k] = None + parameters[k] = None if parameters_docs is not None: data_type, description = parameters_docs[k] parameters_meta_info[k] = OrderedDict( - (("description", description), ("data_type", data_type)) + (("description", description), ("data_type", data_type)), ) else: parameters_meta_info[k] = OrderedDict((("description", None), ("data_type", None))) return parameters, parameters_meta_info, sub_components, sub_components_explicit - def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> Tuple[Dict, Set]: + def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> 
tuple[dict, set]: """ - Returns: + Returns + ------- i) a dict with all parameter names that have a default value, and ii) a set with all parameter names that do not have a default @@ -1123,8 +1119,8 @@ def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> Tuple[Dict, Set] # parameters with defaults are optional, all others are required. parameters = inspect.signature(fn_name).parameters required_params = set() - optional_params = dict() - for param in parameters.keys(): + optional_params = {} + for param in parameters: parameter = parameters.get(param) default_val = parameter.default # type: ignore if default_val is inspect.Signature.empty: @@ -1136,17 +1132,17 @@ def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> Tuple[Dict, Set] def _deserialize_model( self, flow: OpenMLFlow, - keep_defaults: bool, + keep_defaults: bool, # noqa: FBT001 recursion_depth: int, - strict_version: bool = True, + strict_version: bool = True, # noqa: FBT002, FBT001 ) -> Any: - logger.info("-%s deserialize %s" % ("-" * recursion_depth, flow.name)) + logger.info("-{} deserialize {}".format("-" * recursion_depth, flow.name)) model_name = flow.class_name self._check_dependencies(flow.dependencies, strict_version=strict_version) parameters = flow.parameters components = flow.components - parameter_dict = OrderedDict() # type: Dict[str, Any] + parameter_dict: dict[str, Any] = OrderedDict() # Do a shallow copy of the components dictionary so we can remove the # components from this copy once we added them into the pipeline. 
This @@ -1157,7 +1153,9 @@ def _deserialize_model( for name in parameters: value = parameters.get(name) - logger.info("--%s flow_parameter=%s, value=%s" % ("-" * recursion_depth, name, value)) + logger.info( + "--{} flow_parameter={}, value={}".format("-" * recursion_depth, name, value) + ) rval = self._deserialize_sklearn( value, components=components_, @@ -1173,36 +1171,46 @@ def _deserialize_model( if name not in components_: continue value = components[name] - logger.info("--%s flow_component=%s, value=%s" % ("-" * recursion_depth, name, value)) + logger.info( + "--{} flow_component={}, value={}".format("-" * recursion_depth, name, value) + ) rval = self._deserialize_sklearn( - value, recursion_depth=recursion_depth + 1, strict_version=strict_version + value, + recursion_depth=recursion_depth + 1, + strict_version=strict_version, ) parameter_dict[name] = rval if model_name is None and flow.name in SKLEARN_PIPELINE_STRING_COMPONENTS: return flow.name - else: - module_name = model_name.rsplit(".", 1) - model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - - if keep_defaults: - # obtain all params with a default - param_defaults, _ = self._get_fn_arguments_with_defaults(model_class.__init__) - - # delete the params that have a default from the dict, - # so they get initialized with their default value - # except [...] - for param in param_defaults: - # [...] the ones that also have a key in the components dict. 
- # As OpenML stores different flows for ensembles with different - # (base-)components, in OpenML terms, these are not considered - # hyperparameters but rather constants (i.e., changing them would - # result in a different flow) - if param not in components.keys(): - del parameter_dict[param] - return model_class(**parameter_dict) - - def _check_dependencies(self, dependencies: str, strict_version: bool = True) -> None: + + assert model_name is not None + module_name = model_name.rsplit(".", 1) + model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) + + if keep_defaults: + # obtain all params with a default + param_defaults, _ = self._get_fn_arguments_with_defaults(model_class.__init__) + + # delete the params that have a default from the dict, + # so they get initialized with their default value + # except [...] + for param in param_defaults: + # [...] the ones that also have a key in the components dict. + # As OpenML stores different flows for ensembles with different + # (base-)components, in OpenML terms, these are not considered + # hyperparameters but rather constants (i.e., changing them would + # result in a different flow) + if param not in components: + del parameter_dict[param] + + return model_class(**parameter_dict) + + def _check_dependencies( + self, + dependencies: str, + strict_version: bool = True, # noqa: FBT001, FBT002 + ) -> None: if not dependencies: return @@ -1232,15 +1240,15 @@ def _check_dependencies(self, dependencies: str, strict_version: bool = True) -> raise NotImplementedError("operation '%s' is not supported" % operation) message = ( "Trying to deserialize a model with dependency " - "%s not satisfied." % dependency_string + f"{dependency_string} not satisfied." 
) if not check: if strict_version: raise ValueError(message) - else: - warnings.warn(message) - def _serialize_type(self, o: Any) -> "OrderedDict[str, str]": + warnings.warn(message, category=UserWarning, stacklevel=2) + + def _serialize_type(self, o: Any) -> OrderedDict[str, str]: mapping = { float: "float", np.float32: "np.float32", @@ -1250,8 +1258,8 @@ def _serialize_type(self, o: Any) -> "OrderedDict[str, str]": np.int64: "np.int64", } if LooseVersion(np.__version__) < "1.24": - mapping[np.float] = "np.float" - mapping[np.int] = "np.int" + mapping[float] = "np.float" + mapping[int] = "np.int" ret = OrderedDict() # type: 'OrderedDict[str, str]' ret["oml-python:serialized_object"] = "type" @@ -1267,26 +1275,28 @@ def _deserialize_type(self, o: str) -> Any: "np.int32": np.int32, "np.int64": np.int64, } + + # TODO(eddiebergman): Might be able to remove this if LooseVersion(np.__version__) < "1.24": - mapping["np.float"] = np.float - mapping["np.int"] = np.int + mapping["np.float"] = np.float # type: ignore # noqa: NPY001 + mapping["np.int"] = np.int # type: ignore # noqa: NPY001 return mapping[o] - def _serialize_rv_frozen(self, o: Any) -> "OrderedDict[str, Union[str, Dict]]": + def _serialize_rv_frozen(self, o: Any) -> OrderedDict[str, str | dict]: args = o.args kwds = o.kwds a = o.a b = o.b dist = o.dist.__class__.__module__ + "." 
+ o.dist.__class__.__name__ - ret = OrderedDict() # type: 'OrderedDict[str, Union[str, Dict]]' + ret: OrderedDict[str, str | dict] = OrderedDict() ret["oml-python:serialized_object"] = "rv_frozen" ret["value"] = OrderedDict( - (("dist", dist), ("a", a), ("b", b), ("args", args), ("kwds", kwds)) + (("dist", dist), ("a", a), ("b", b), ("args", args), ("kwds", kwds)), ) return ret - def _deserialize_rv_frozen(self, o: "OrderedDict[str, str]") -> Any: + def _deserialize_rv_frozen(self, o: OrderedDict[str, str]) -> Any: args = o["args"] kwds = o["kwds"] a = o["a"] @@ -1296,17 +1306,23 @@ def _deserialize_rv_frozen(self, o: "OrderedDict[str, str]") -> Any: module_name = dist_name.rsplit(".", 1) try: rv_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - except AttributeError: - warnings.warn("Cannot create model %s for flow." % dist_name) + except AttributeError as e: + _tb = traceback.format_exc() + warnings.warn( + f"Cannot create model {dist_name} for flow. Reason is from error {type(e)}:{e}" + f"\nTraceback: {_tb}", + RuntimeWarning, + stacklevel=2, + ) return None - dist = scipy.stats.distributions.rv_frozen(rv_class(), *args, **kwds) + dist = scipy.stats.distributions.rv_frozen(rv_class(), *args, **kwds) # type: ignore dist.a = a dist.b = b return dist - def _serialize_function(self, o: Callable) -> "OrderedDict[str, str]": + def _serialize_function(self, o: Callable) -> OrderedDict[str, str]: name = o.__module__ + "." 
+ o.__name__ ret = OrderedDict() # type: 'OrderedDict[str, str]' ret["oml-python:serialized_object"] = "function" @@ -1315,11 +1331,10 @@ def _serialize_function(self, o: Callable) -> "OrderedDict[str, str]": def _deserialize_function(self, name: str) -> Callable: module_name = name.rsplit(".", 1) - function_handle = getattr(importlib.import_module(module_name[0]), module_name[1]) - return function_handle + return getattr(importlib.import_module(module_name[0]), module_name[1]) - def _serialize_cross_validator(self, o: Any) -> "OrderedDict[str, Union[str, Dict]]": - ret = OrderedDict() # type: 'OrderedDict[str, Union[str, Dict]]' + def _serialize_cross_validator(self, o: Any) -> OrderedDict[str, str | dict]: + ret: OrderedDict[str, str | dict] = OrderedDict() parameters = OrderedDict() # type: 'OrderedDict[str, Any]' @@ -1327,7 +1342,7 @@ def _serialize_cross_validator(self, o: Any) -> "OrderedDict[str, Union[str, Dic cls = o.__class__ init = getattr(cls.__init__, "deprecated_original", cls.__init__) # Ignore varargs, kw and default values and pop self - init_signature = inspect.signature(init) + init_signature = inspect.signature(init) # type: ignore # Consider the constructor parameters excluding 'self' if init is object.__init__: args = [] # type: List @@ -1337,7 +1352,7 @@ def _serialize_cross_validator(self, o: Any) -> "OrderedDict[str, Union[str, Dic p.name for p in init_signature.parameters.values() if p.name != "self" and p.kind != p.VAR_KEYWORD - ] + ], ) for key in args: @@ -1366,7 +1381,10 @@ def _serialize_cross_validator(self, o: Any) -> "OrderedDict[str, Union[str, Dic return ret def _deserialize_cross_validator( - self, value: "OrderedDict[str, Any]", recursion_depth: int, strict_version: bool = True + self, + value: OrderedDict[str, Any], + recursion_depth: int, + strict_version: bool = True, # noqa: FBT002, FBT001 ) -> Any: model_name = value["name"] parameters = value["parameters"] @@ -1386,12 +1404,13 @@ def _format_external_version( 
model_package_name: str, model_package_version_number: str, ) -> str: - return "%s==%s" % (model_package_name, model_package_version_number) + return f"{model_package_name}=={model_package_version_number}" @staticmethod def _get_parameter_values_recursive( - param_grid: Union[Dict, List[Dict]], parameter_name: str - ) -> List[Any]: + param_grid: dict | list[dict], + parameter_name: str, + ) -> list[Any]: """ Returns a list of values for a given hyperparameter, encountered recursively throughout the flow. (e.g., n_jobs can be defined @@ -1412,28 +1431,28 @@ def _get_parameter_values_recursive( A list of all values of hyperparameters with this name """ if isinstance(param_grid, dict): - result = list() - for param, value in param_grid.items(): - # n_jobs is scikit-learn parameter for parallelizing jobs - if param.split("__")[-1] == parameter_name: - result.append(value) - return result - elif isinstance(param_grid, list): - result = list() + return [ + value + for param, value in param_grid.items() + if param.split("__")[-1] == parameter_name + ] + + if isinstance(param_grid, list): + result = [] for sub_grid in param_grid: result.extend( - SklearnExtension._get_parameter_values_recursive(sub_grid, parameter_name) + SklearnExtension._get_parameter_values_recursive(sub_grid, parameter_name), ) return result - else: - raise ValueError("Param_grid should either be a dict or list of dicts") + + raise ValueError("Param_grid should either be a dict or list of dicts") def _prevent_optimize_n_jobs(self, model): """ Ensures that HPO classes will not optimize the n_jobs hyperparameter - Parameters: - ----------- + Parameters + ---------- model: The model that will be fitted """ @@ -1450,19 +1469,20 @@ def _prevent_optimize_n_jobs(self, model): "Using subclass BaseSearchCV other than " "{GridSearchCV, RandomizedSearchCV}. " "Could not find attribute " - "param_distributions." + "param_distributions.", ) logger.warning( "Warning! 
Using subclass BaseSearchCV other than " "{GridSearchCV, RandomizedSearchCV}. " - "Should implement param check. " + "Should implement param check. ", ) n_jobs_vals = SklearnExtension._get_parameter_values_recursive( - param_distributions, "n_jobs" + param_distributions, + "n_jobs", ) if len(n_jobs_vals) > 0: raise PyOpenMLError( - "openml-python should not be used to " "optimize the n_jobs parameter." + "openml-python should not be used to " "optimize the n_jobs parameter.", ) ################################################################################################ @@ -1485,7 +1505,7 @@ def is_estimator(self, model: Any) -> bool: o = model return hasattr(o, "fit") and hasattr(o, "get_params") and hasattr(o, "set_params") - def seed_model(self, model: Any, seed: Optional[int] = None) -> Any: + def seed_model(self, model: Any, seed: int | None = None) -> Any: # noqa: C901 """Set the random state of all the unseeded components of a model and return the seeded model. @@ -1511,17 +1531,19 @@ def seed_model(self, model: Any, seed: Optional[int] = None) -> Any: def _seed_current_object(current_value): if isinstance(current_value, int): # acceptable behaviour return False - elif isinstance(current_value, np.random.RandomState): + + if isinstance(current_value, np.random.RandomState): raise ValueError( "Models initialized with a RandomState object are not " - "supported. Please seed with an integer. " + "supported. Please seed with an integer. ", ) - elif current_value is not None: + + if current_value is not None: raise ValueError( - "Models should be seeded with int or None (this should never " "happen). " + "Models should be seeded with int or None (this should never " "happen). 
", ) - else: - return True + + return True rs = np.random.RandomState(seed) model_params = model.get_params() @@ -1561,12 +1583,15 @@ def check_if_model_fitted(self, model: Any) -> bool: ------- bool """ + from sklearn.exceptions import NotFittedError + from sklearn.utils.validation import check_is_fitted + try: # check if model is fitted - from sklearn.exceptions import NotFittedError + check_is_fitted(model) # Creating random dummy data of arbitrary size - dummy_data = np.random.uniform(size=(10, 3)) + dummy_data = np.random.uniform(size=(10, 3)) # noqa: NPY002 # Using 'predict' instead of 'sklearn.utils.validation.check_is_fitted' for a more # robust check that works across sklearn versions and models. Internally, 'predict' # should call 'check_is_fitted' for every concerned attribute, thus offering a more @@ -1581,17 +1606,20 @@ def check_if_model_fitted(self, model: Any) -> bool: # Will reach here if the model was fit on a dataset with more or less than 3 features return True - def _run_model_on_fold( + def _run_model_on_fold( # noqa: PLR0915, PLR0913, C901, PLR0912 self, model: Any, - task: "OpenMLTask", - X_train: Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame], + task: OpenMLTask, + X_train: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame, rep_no: int, fold_no: int, - y_train: Optional[np.ndarray] = None, - X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None, - ) -> Tuple[ - np.ndarray, Optional[pd.DataFrame], "OrderedDict[str, float]", Optional[OpenMLRunTrace] + y_train: np.ndarray | None = None, + X_test: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame | None = None, + ) -> tuple[ + np.ndarray, + pd.DataFrame | None, + OrderedDict[str, float], + OpenMLRunTrace | None, ]: """Run a model on a repeat,fold,subsample triplet of the task and return prediction information. 
@@ -1640,7 +1668,9 @@ def _run_model_on_fold( """ def _prediction_to_probabilities( - y: Union[np.ndarray, List], model_classes: List[Any], class_labels: Optional[List[str]] + y: np.ndarray | list, + model_classes: list[Any], + class_labels: list[str] | None, ) -> pd.DataFrame: """Transforms predicted probabilities to match with OpenML class indices. @@ -1673,7 +1703,10 @@ def _prediction_to_probabilities( # DataFrame allows more accurate mapping of classes as column names result = pd.DataFrame( - 0, index=np.arange(len(y)), columns=model_classes, dtype=np.float32 + 0, + index=np.arange(len(y)), + columns=model_classes, + dtype=np.float32, ) for obs, prediction in enumerate(y): result.loc[obs, prediction] = 1.0 @@ -1696,20 +1729,20 @@ def _prediction_to_probabilities( modelfit_start_walltime = time.time() if isinstance(task, OpenMLSupervisedTask): - model_copy.fit(X_train, y_train) + model_copy.fit(X_train, y_train) # type: ignore elif isinstance(task, OpenMLClusteringTask): - model_copy.fit(X_train) + model_copy.fit(X_train) # type: ignore modelfit_dur_cputime = (time.process_time() - modelfit_start_cputime) * 1000 modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000 user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime - refit_time = model_copy.refit_time_ * 1000 if hasattr(model_copy, "refit_time_") else 0 + refit_time = model_copy.refit_time_ * 1000 if hasattr(model_copy, "refit_time_") else 0 # type: ignore user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime except AttributeError as e: # typically happens when training a regressor on classification task - raise PyOpenMLError(str(e)) + raise PyOpenMLError(str(e)) from e if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): # search for model classes_ (might differ depending on modeltype) @@ -1732,7 +1765,8 @@ def _prediction_to_probabilities( # to handle the case when dataset is numpy and categories are encoded # however 
the class labels stored in task are still categories if isinstance(y_train, np.ndarray) and isinstance( - cast(List, task.class_labels)[0], str + cast(List, task.class_labels)[0], + str, ): model_classes = [cast(List[str], task.class_labels)[i] for i in model_classes] @@ -1782,10 +1816,10 @@ def _prediction_to_probabilities( proba_y.shape[1], len(task.class_labels), ) - warnings.warn(message) + warnings.warn(message, stacklevel=2) openml.config.logger.warning(message) - for i, col in enumerate(task.class_labels): + for _i, col in enumerate(task.class_labels): # adding missing columns with 0 probability if col not in model_classes: proba_y[col] = 0 @@ -1798,30 +1832,27 @@ def _prediction_to_probabilities( missing_cols = list(set(task.class_labels) - set(proba_y.columns)) raise ValueError("Predicted probabilities missing for the columns: ", missing_cols) - elif isinstance(task, OpenMLRegressionTask): + elif isinstance(task, (OpenMLRegressionTask, OpenMLClusteringTask)): proba_y = None - - elif isinstance(task, OpenMLClusteringTask): - proba_y = None - else: raise TypeError(type(task)) if self._is_hpo_class(model_copy): trace_data = self._extract_trace_data(model_copy, rep_no, fold_no) - trace = self._obtain_arff_trace( - model_copy, trace_data - ) # type: Optional[OpenMLRunTrace] # noqa E501 + trace: OpenMLRunTrace | None = self._obtain_arff_trace( + model_copy, + trace_data, + ) else: trace = None return pred_y, proba_y, user_defined_measures, trace - def obtain_parameter_values( + def obtain_parameter_values( # noqa: C901, PLR0915 self, - flow: "OpenMLFlow", + flow: OpenMLFlow, model: Any = None, - ) -> List[Dict[str, Any]]: + ) -> list[dict[str, Any]]: """Extracts all parameter settings required for the flow from the model. 
If no explicit model is provided, the parameters will be extracted from `flow.model` @@ -1852,7 +1883,13 @@ def get_flow_dict(_flow): flow_map.update(get_flow_dict(_flow.components[subflow])) return flow_map - def extract_parameters(_flow, _flow_dict, component_model, _main_call=False, main_id=None): + def extract_parameters( # noqa: PLR0915, PLR0912, C901 + _flow, + _flow_dict, + component_model, + _main_call=False, # noqa: FBT002 + main_id=None, + ): def is_subcomponent_specification(values): # checks whether the current value can be a specification of # subcomponents, as for example the value for steps parameter @@ -1885,7 +1922,7 @@ def is_subcomponent_specification(values): ): model_parameters = set() else: - model_parameters = set([mp for mp in component_model.get_params(deep=False)]) + model_parameters = set(component_model.get_params(deep=False)) if len(exp_parameters.symmetric_difference(model_parameters)) != 0: flow_params = sorted(exp_parameters) model_params = sorted(model_parameters) @@ -1893,7 +1930,7 @@ def is_subcomponent_specification(values): "Parameters of the model do not match the " "parameters expected by the " "flow:\nexpected flow parameters: " - "%s\nmodel parameters: %s" % (flow_params, model_params) + f"{flow_params}\nmodel parameters: {model_params}", ) exp_components = set(_flow.components) if ( @@ -1902,14 +1939,12 @@ def is_subcomponent_specification(values): ): model_components = set() else: - _ = set([mp for mp in component_model.get_params(deep=False)]) - model_components = set( - [ - mp - for mp in component_model.get_params(deep=True) - if "__" not in mp and mp not in _ - ] - ) + _ = set(component_model.get_params(deep=False)) + model_components = { + mp + for mp in component_model.get_params(deep=True) + if "__" not in mp and mp not in _ + } if len(exp_components.symmetric_difference(model_components)) != 0: is_problem = True if len(exp_components - model_components) > 0: @@ -1931,7 +1966,7 @@ def 
is_subcomponent_specification(values): "Subcomponents of the model do not match the " "parameters expected by the " "flow:\nexpected flow subcomponents: " - "%s\nmodel subcomponents: %s" % (flow_components, model_components) + f"{flow_components}\nmodel subcomponents: {model_components}", ) _params = [] @@ -1949,7 +1984,7 @@ def is_subcomponent_specification(values): if is_subcomponent_specification(current_param_values): # complex parameter value, with subcomponents - parsed_values = list() + parsed_values = [] for subcomponent in current_param_values: # scikit-learn stores usually tuples in the form # (name (str), subcomponent (mixed), argument @@ -1963,7 +1998,7 @@ def is_subcomponent_specification(values): if not isinstance(subcomponent_identifier, str): raise TypeError( "Subcomponent identifier should be of type string, " - "but is {}".format(type(subcomponent_identifier)) + f"but is {type(subcomponent_identifier)}", ) if not isinstance(subcomponent_flow, (openml.flows.OpenMLFlow, str)): if ( @@ -1974,8 +2009,8 @@ def is_subcomponent_specification(values): else: raise TypeError( "Subcomponent flow should be of type flow, but is {}".format( - type(subcomponent_flow) - ) + type(subcomponent_flow), + ), ) current = { @@ -1987,10 +2022,11 @@ def is_subcomponent_specification(values): } if len(subcomponent) == 3: if not isinstance(subcomponent[2], list) and not isinstance( - subcomponent[2], OrderedDict + subcomponent[2], + OrderedDict, ): raise TypeError( - "Subcomponent argument should be list or OrderedDict" + "Subcomponent argument should be list or OrderedDict", ) current["value"]["argument_1"] = subcomponent[2] parsed_values.append(current) @@ -2010,16 +2046,16 @@ def is_subcomponent_specification(values): subcomponent_model = component_model.get_params()[_identifier] _params.extend( extract_parameters( - _flow.components[_identifier], _flow_dict, subcomponent_model - ) + _flow.components[_identifier], + _flow_dict, + subcomponent_model, + ), ) return _params 
flow_dict = get_flow_dict(flow) model = model if model is not None else flow.model - parameters = extract_parameters(flow, flow_dict, model, True, flow.flow_id) - - return parameters + return extract_parameters(flow, flow_dict, model, _main_call=True, main_id=flow.flow_id) def _openml_param_name_to_sklearn( self, @@ -2094,15 +2130,31 @@ def instantiate_model_from_hpo_class( if not self._is_hpo_class(model): raise AssertionError( "Flow model %s is not an instance of sklearn.model_selection._search.BaseSearchCV" - % model + % model, ) base_estimator = model.estimator base_estimator.set_params(**trace_iteration.get_parameters()) return base_estimator def _extract_trace_data(self, model, rep_no, fold_no): + """Extracts data from a machine learning model's cross-validation results + and creates an ARFF (Attribute-Relation File Format) trace. + + Parameters + ---------- + model : Any + A fitted hyperparameter optimization model. + rep_no : int + The repetition number. + fold_no : int + The fold number. + + Returns + ------- + A list of ARFF tracecontent. 
+ """ arff_tracecontent = [] - for itt_no in range(0, len(model.cv_results_["mean_test_score"])): + for itt_no in range(len(model.cv_results_["mean_test_score"])): # we use the string values for True and False, as it is defined in # this way by the OpenML server selected = "false" @@ -2113,10 +2165,7 @@ def _extract_trace_data(self, model, rep_no, fold_no): for key in model.cv_results_: if key.startswith("param_"): value = model.cv_results_[key][itt_no] - if value is not np.ma.masked: - serialized_value = json.dumps(value) - else: - serialized_value = np.nan + serialized_value = json.dumps(value) if value is not np.ma.masked else np.nan arff_line.append(serialized_value) arff_tracecontent.append(arff_line) return arff_tracecontent @@ -2124,8 +2173,8 @@ def _extract_trace_data(self, model, rep_no, fold_no): def _obtain_arff_trace( self, model: Any, - trace_content: List, - ) -> "OpenMLRunTrace": + trace_content: list, + ) -> OpenMLRunTrace: """Create arff trace object from a fitted model and the trace content obtained by repeatedly calling ``run_model_on_task``. 
@@ -2144,7 +2193,7 @@ def _obtain_arff_trace( if not self._is_hpo_class(model): raise AssertionError( "Flow model %s is not an instance of sklearn.model_selection._search.BaseSearchCV" - % model + % model, ) if not hasattr(model, "cv_results_"): raise ValueError("model should contain `cv_results_`") @@ -2171,20 +2220,20 @@ def _obtain_arff_trace( or param_value is np.ma.masked ): # basic string values - type = "STRING" + type = "STRING" # noqa: A001 elif isinstance(param_value, (list, tuple)) and all( isinstance(i, int) for i in param_value ): # list of integers (usually for selecting features) # hyperparameter layer_sizes of MLPClassifier - type = "STRING" + type = "STRING" # noqa: A001 else: raise TypeError("Unsupported param type in param grid: %s" % key) # renamed the attribute param to parameter, as this is a required # OpenML convention - this also guards against name collisions # with the required trace attributes - attribute = (PREFIX + key[6:], type) + attribute = (PREFIX + key[6:], type) # type: ignore trace_attributes.append(attribute) return OpenMLRunTrace.generate( diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py index f8d35c3f5..ce32fec7d 100644 --- a/openml/flows/__init__.py +++ b/openml/flows/__init__.py @@ -1,14 +1,13 @@ # License: BSD 3-Clause from .flow import OpenMLFlow - from .functions import ( - get_flow, - list_flows, - flow_exists, - get_flow_id, assert_flows_equal, delete_flow, + flow_exists, + get_flow, + get_flow_id, + list_flows, ) __all__ = [ diff --git a/openml/flows/flow.py b/openml/flows/flow.py index b9752e77c..4e437e35c 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -1,15 +1,16 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import OrderedDict -import os -from typing import Dict, List, Union, Tuple, Optional # noqa: F401 import logging +from collections import OrderedDict +from pathlib import Path +from typing import Any, Hashable, Sequence import xmltodict from 
openml.base import OpenMLBase -from ..extensions import get_extension_by_flow -from ..utils import extract_xml_tags +from openml.extensions import Extension, get_extension_by_flow +from openml.utils import extract_xml_tags class OpenMLFlow(OpenMLBase): @@ -59,10 +60,10 @@ class OpenMLFlow(OpenMLBase): A list of dependencies necessary to run the flow. This field should contain all libraries the flow depends on. To allow reproducibility it should also specify the exact version numbers. - class_name : str + class_name : str, optional The development language name of the class which is described by this flow. - custom_name : str + custom_name : str, optional Custom name of the flow given by the owner. binary_url : str, optional Url from which the binary can be downloaded. Added by the server. @@ -81,32 +82,34 @@ class OpenMLFlow(OpenMLBase): Date the flow was uploaded. Filled in by the server. flow_id : int, optional Flow ID. Assigned by the server. + extension : Extension, optional + The extension for a flow (e.g., sklearn). version : str, optional OpenML version of the flow. Assigned by the server. 
""" - def __init__( + def __init__( # noqa: PLR0913 self, - name, - description, - model, - components, - parameters, - parameters_meta_info, - external_version, - tags, - language, - dependencies, - class_name=None, - custom_name=None, - binary_url=None, - binary_format=None, - binary_md5=None, - uploader=None, - upload_date=None, - flow_id=None, - extension=None, - version=None, + name: str, + description: str, + model: object, + components: dict, + parameters: dict, + parameters_meta_info: dict, + external_version: str, + tags: list, + language: str, + dependencies: str, + class_name: str | None = None, + custom_name: str | None = None, + binary_url: str | None = None, + binary_format: str | None = None, + binary_md5: str | None = None, + uploader: str | None = None, + upload_date: str | None = None, + flow_id: int | None = None, + extension: Extension | None = None, + version: str | None = None, ): self.name = name self.description = description @@ -117,10 +120,10 @@ def __init__( [parameters, "parameters"], [parameters_meta_info, "parameters_meta_info"], ]: - if not isinstance(variable, OrderedDict): + if not isinstance(variable, (OrderedDict, dict)): raise TypeError( - "%s must be of type OrderedDict, " - "but is %s." % (variable_name, type(variable)) + f"{variable_name} must be of type OrderedDict or dict, " + f"but is {type(variable)}.", ) self.components = components @@ -133,13 +136,14 @@ def __init__( if len(keys_parameters.difference(keys_parameters_meta_info)) > 0: raise ValueError( "Parameter %s only in parameters, but not in " - "parameters_meta_info." % str(keys_parameters.difference(keys_parameters_meta_info)) + "parameters_meta_info." + % str(keys_parameters.difference(keys_parameters_meta_info)), ) if len(keys_parameters_meta_info.difference(keys_parameters)) > 0: raise ValueError( "Parameter %s only in parameters_meta_info, " "but not in parameters." 
- % str(keys_parameters_meta_info.difference(keys_parameters)) + % str(keys_parameters_meta_info.difference(keys_parameters)), ) self.external_version = external_version @@ -161,19 +165,21 @@ def __init__( self._extension = extension @property - def id(self) -> Optional[int]: + def id(self) -> int | None: + """The ID of the flow.""" return self.flow_id @property - def extension(self): + def extension(self) -> Extension: + """The extension of the flow (e.g., sklearn).""" if self._extension is not None: return self._extension - else: - raise RuntimeError( - "No extension could be found for flow {}: {}".format(self.flow_id, self.name) - ) - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + raise RuntimeError( + f"No extension could be found for flow {self.flow_id}: {self.name}", + ) + + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: """Collect all information to display in the __repr__ body.""" fields = { "Flow Name": self.name, @@ -181,10 +187,10 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: "Dependencies": self.dependencies, } if self.flow_id is not None: - fields["Flow URL"] = self.openml_url + fields["Flow URL"] = self.openml_url if self.openml_url is not None else "None" fields["Flow ID"] = str(self.flow_id) if self.version is not None: - fields["Flow ID"] += " (version {})".format(self.version) + fields["Flow ID"] += f" (version {self.version})" if self.upload_date is not None: fields["Upload Date"] = self.upload_date.replace("T", " ") if self.binary_url is not None: @@ -202,18 +208,18 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: ] return [(key, fields[key]) for key in order if key in fields] - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": + def _to_dict(self) -> dict[str, dict]: # noqa: C901, PLR0912 """Creates a dictionary representation of self.""" - flow_container = OrderedDict() # type: 'OrderedDict[str, 
OrderedDict]' + flow_container = OrderedDict() # type: 'dict[str, dict]' flow_dict = OrderedDict( - [("@xmlns:oml", "http://openml.org/openml")] - ) # type: 'OrderedDict[str, Union[List, str]]' # noqa E501 + [("@xmlns:oml", "http://openml.org/openml")], + ) # type: 'dict[str, list | str]' # E501 flow_container["oml:flow"] = flow_dict _add_if_nonempty(flow_dict, "oml:id", self.flow_id) for required in ["name", "external_version"]: if getattr(self, required) is None: - raise ValueError("self.{} is required but None".format(required)) + raise ValueError(f"self.{required} is required but None") for attribute in [ "uploader", "name", @@ -226,7 +232,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": "language", "dependencies", ]: - _add_if_nonempty(flow_dict, "oml:{}".format(attribute), getattr(self, attribute)) + _add_if_nonempty(flow_dict, f"oml:{attribute}", getattr(self, attribute)) if not self.description: logger = logging.getLogger(__name__) @@ -245,15 +251,15 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": for key_, value in param_dict.items(): if key_ is not None and not isinstance(key_, str): raise ValueError( - "Parameter name %s cannot be serialized " - "because it is of type %s. Only strings " - "can be serialized." % (key_, type(key_)) + f"Parameter name {key_} cannot be serialized " + f"because it is of type {type(key_)}. Only strings " + "can be serialized.", ) if value is not None and not isinstance(value, str): raise ValueError( - "Parameter value %s cannot be serialized " - "because it is of type %s. Only strings " - "can be serialized." % (value, type(value)) + f"Parameter value {value} cannot be serialized " + f"because it is of type {type(value)}. 
Only strings " + "can be serialized.", ) flow_parameters.append(param_dict) @@ -262,7 +268,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": components = [] for key in self.components: - component_dict = OrderedDict() # type: 'OrderedDict[str, Dict]' + component_dict = OrderedDict() # type: 'OrderedDict[str, dict]' component_dict["oml:identifier"] = key if self.components[key] in ["passthrough", "drop"]: component_dict["oml:flow"] = { @@ -277,9 +283,9 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": # value is a flow. The flow itself is valid by recursion if key_ is not None and not isinstance(key_, str): raise ValueError( - "Parameter name %s cannot be serialized " - "because it is of type %s. Only strings " - "can be serialized." % (key_, type(key_)) + f"Parameter name {key_} cannot be serialized " + f"because it is of type {type(key_)}. Only strings " + "can be serialized.", ) components.append(component_dict) @@ -287,12 +293,12 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": flow_dict["oml:component"] = components flow_dict["oml:tag"] = self.tags for attribute in ["binary_url", "binary_format", "binary_md5"]: - _add_if_nonempty(flow_dict, "oml:{}".format(attribute), getattr(self, attribute)) + _add_if_nonempty(flow_dict, f"oml:{attribute}", getattr(self, attribute)) return flow_container @classmethod - def _from_dict(cls, xml_dict): + def _from_dict(cls, xml_dict: dict) -> OpenMLFlow: """Create a flow from an xml description. 
Calls itself recursively to create :class:`OpenMLFlow` objects of @@ -310,7 +316,7 @@ def _from_dict(cls, xml_dict): ------- OpenMLFlow - """ # noqa E501 + """ # E501 arguments = OrderedDict() dic = xml_dict["oml:flow"] @@ -380,30 +386,34 @@ def _from_dict(cls, xml_dict): arguments["tags"] = extract_xml_tags("oml:tag", dic) arguments["model"] = None - flow = cls(**arguments) + return cls(**arguments) - return flow + def to_filesystem(self, output_directory: str | Path) -> None: + """Write a flow to the filesystem as XML to output_directory.""" + output_directory = Path(output_directory) + output_directory.mkdir(parents=True, exist_ok=True) - def to_filesystem(self, output_directory: str) -> None: - os.makedirs(output_directory, exist_ok=True) - if "flow.xml" in os.listdir(output_directory): + output_path = output_directory / "flow.xml" + if output_path.exists(): raise ValueError("Output directory already contains a flow.xml file.") run_xml = self._to_xml() - with open(os.path.join(output_directory, "flow.xml"), "w") as f: + with output_path.open("w") as f: f.write(run_xml) @classmethod - def from_filesystem(cls, input_directory) -> "OpenMLFlow": - with open(os.path.join(input_directory, "flow.xml"), "r") as f: + def from_filesystem(cls, input_directory: str | Path) -> OpenMLFlow: + """Read a flow from an XML in input_directory on the filesystem.""" + input_directory = Path(input_directory) / "flow.xml" + with input_directory.open() as f: xml_string = f.read() return OpenMLFlow._from_dict(xmltodict.parse(xml_string)) - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.flow_id = int(xml_response["oml:upload_flow"]["oml:id"]) - def publish(self, raise_error_if_exists: bool = False) -> "OpenMLFlow": + def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow: # noqa: FBT001, FBT002 """Publish this flow to OpenML server. 
Raises a PyOpenMLError if the flow exists on the server, but @@ -430,17 +440,17 @@ def publish(self, raise_error_if_exists: bool = False) -> "OpenMLFlow": if not flow_id: if self.flow_id: raise openml.exceptions.PyOpenMLError( - "Flow does not exist on the server, " "but 'flow.flow_id' is not None." + "Flow does not exist on the server, " "but 'flow.flow_id' is not None.", ) super().publish() + assert self.flow_id is not None # for mypy flow_id = self.flow_id elif raise_error_if_exists: - error_message = "This OpenMLFlow already exists with id: {}.".format(flow_id) + error_message = f"This OpenMLFlow already exists with id: {flow_id}." raise openml.exceptions.PyOpenMLError(error_message) elif self.flow_id is not None and self.flow_id != flow_id: raise openml.exceptions.PyOpenMLError( - "Local flow_id does not match server flow_id: " - "'{}' vs '{}'".format(self.flow_id, flow_id) + "Local flow_id does not match server flow_id: " f"'{self.flow_id}' vs '{flow_id}'", ) flow = openml.flows.functions.get_flow(flow_id) @@ -457,12 +467,12 @@ def publish(self, raise_error_if_exists: bool = False) -> "OpenMLFlow": message = e.args[0] raise ValueError( "The flow on the server is inconsistent with the local flow. " - "The server flow ID is {}. Please check manually and remove " - "the flow if necessary! Error is:\n'{}'".format(flow_id, message) - ) + f"The server flow ID is {flow_id}. Please check manually and remove " + f"the flow if necessary! Error is:\n'{message}'", + ) from e return self - def get_structure(self, key_item: str) -> Dict[str, List[str]]: + def get_structure(self, key_item: str) -> dict[str, list[str]]: """ Returns for each sub-component of the flow the path of identifiers that should be traversed to reach this component. 
The resulting dict @@ -482,15 +492,15 @@ def get_structure(self, key_item: str) -> Dict[str, List[str]]: """ if key_item not in ["flow_id", "name"]: raise ValueError("key_item should be in {flow_id, name}") - structure = dict() + structure = {} for key, sub_flow in self.components.items(): sub_structure = sub_flow.get_structure(key_item) for flow_name, flow_sub_structure in sub_structure.items(): - structure[flow_name] = [key] + flow_sub_structure + structure[flow_name] = [key, *flow_sub_structure] structure[getattr(self, key_item)] = [] return structure - def get_subflow(self, structure): + def get_subflow(self, structure: list[str]) -> OpenMLFlow: """ Returns a subflow from the tree of dependencies. @@ -512,17 +522,30 @@ def get_subflow(self, structure): sub_identifier = structure[0] if sub_identifier not in self.components: raise ValueError( - "Flow %s does not contain component with " - "identifier %s" % (self.name, sub_identifier) + f"Flow {self.name} does not contain component with " f"identifier {sub_identifier}", ) if len(structure) == 1: - return self.components[sub_identifier] - else: - structure.pop(0) - return self.components[sub_identifier].get_subflow(structure) + return self.components[sub_identifier] # type: ignore + + structure.pop(0) + return self.components[sub_identifier].get_subflow(structure) # type: ignore -def _copy_server_fields(source_flow, target_flow): +def _copy_server_fields(source_flow: OpenMLFlow, target_flow: OpenMLFlow) -> None: + """Recursively copies the fields added by the server + from the `source_flow` to the `target_flow`. + + Parameters + ---------- + source_flow : OpenMLFlow + To copy the fields from. + target_flow : OpenMLFlow + To copy the fields to. 
+ + Returns + ------- + None + """ fields_added_by_the_server = ["flow_id", "uploader", "version", "upload_date"] for field in fields_added_by_the_server: setattr(target_flow, field, getattr(source_flow, field)) @@ -532,6 +555,21 @@ def _copy_server_fields(source_flow, target_flow): _copy_server_fields(component, target_flow.components[name]) -def _add_if_nonempty(dic, key, value): +def _add_if_nonempty(dic: dict, key: Hashable, value: Any) -> None: + """Adds a key-value pair to a dictionary if the value is not None. + + Parameters + ---------- + dic: dict + To add the key-value pair to. + key: hashable + To add to the dictionary. + value: Any + To add to the dictionary. + + Returns + ------- + None + """ if value is not None: dic[key] = value diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 0e278d33a..b01e54b44 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -1,20 +1,22 @@ # License: BSD 3-Clause -import warnings +from __future__ import annotations -import dateutil.parser -from collections import OrderedDict import os -import io import re -import xmltodict +import warnings +from collections import OrderedDict +from typing import Any, Dict, overload +from typing_extensions import Literal + +import dateutil.parser import pandas as pd -from typing import Any, Union, Dict, Optional, List +import xmltodict -from ..exceptions import OpenMLCacheException import openml._api_calls -from . import OpenMLFlow import openml.utils +from openml.exceptions import OpenMLCacheException +from . import OpenMLFlow FLOWS_CACHE_DIR_NAME = "flows" @@ -57,20 +59,19 @@ def _get_cached_flow(fid: int) -> OpenMLFlow: ------- OpenMLFlow. 
""" - fid_cache_dir = openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, fid) - flow_file = os.path.join(fid_cache_dir, "flow.xml") + flow_file = fid_cache_dir / "flow.xml" try: - with io.open(flow_file, encoding="utf8") as fh: + with flow_file.open(encoding="utf8") as fh: return _create_flow_from_xml(fh.read()) - except (OSError, IOError): + except OSError as e: openml.utils._remove_cache_dir_for_id(FLOWS_CACHE_DIR_NAME, fid_cache_dir) - raise OpenMLCacheException("Flow file for fid %d not " "cached" % fid) + raise OpenMLCacheException("Flow file for fid %d not " "cached" % fid) from e @openml.utils.thread_safe_if_oslo_installed -def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow: +def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow: # noqa: FBT001, FBT002 """Download the OpenML flow for a given flow ID. Parameters @@ -121,25 +122,58 @@ def _get_flow_description(flow_id: int) -> OpenMLFlow: try: return _get_cached_flow(flow_id) except OpenMLCacheException: - xml_file = os.path.join( - openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id), - "flow.xml", + xml_file = ( + openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id) / "flow.xml" ) - flow_xml = openml._api_calls._perform_api_call("flow/%d" % flow_id, request_method="get") - with io.open(xml_file, "w", encoding="utf8") as fh: + + with xml_file.open("w", encoding="utf8") as fh: fh.write(flow_xml) return _create_flow_from_xml(flow_xml) +@overload def list_flows( - offset: Optional[int] = None, - size: Optional[int] = None, - tag: Optional[str] = None, - output_format: str = "dict", - **kwargs -) -> Union[Dict, pd.DataFrame]: + offset: int | None = ..., + size: int | None = ..., + tag: str | None = ..., + output_format: Literal["dict"] = "dict", + **kwargs: Any, +) -> dict: + ... 
+ + +@overload +def list_flows( + offset: int | None = ..., + size: int | None = ..., + tag: str | None = ..., + *, + output_format: Literal["dataframe"], + **kwargs: Any, +) -> pd.DataFrame: + ... + + +@overload +def list_flows( + offset: int | None, + size: int | None, + tag: str | None, + output_format: Literal["dataframe"], + **kwargs: Any, +) -> pd.DataFrame: + ... + + +def list_flows( + offset: int | None = None, + size: int | None = None, + tag: str | None = None, + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Return a list of all flows which are on OpenML. (Supports large amount of results) @@ -186,7 +220,7 @@ def list_flows( """ if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", ) # TODO: [0.15] @@ -199,16 +233,33 @@ def list_flows( warnings.warn(msg, category=FutureWarning, stacklevel=2) return openml.utils._list_all( - output_format=output_format, + list_output_format=output_format, listing_call=_list_flows, offset=offset, size=size, tag=tag, - **kwargs + **kwargs, ) -def _list_flows(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: +@overload +def _list_flows(output_format: Literal["dict"] = ..., **kwargs: Any) -> dict: + ... + + +@overload +def _list_flows(*, output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: + ... + + +@overload +def _list_flows(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: + ... + + +def _list_flows( + output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any +) -> dict | pd.DataFrame: """ Perform the api call that return a list of all flows. 
@@ -230,12 +281,12 @@ def _list_flows(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" return __list_flows(api_call=api_call, output_format=output_format) -def flow_exists(name: str, external_version: str) -> Union[int, bool]: +def flow_exists(name: str, external_version: str) -> int | bool: """Retrieves the flow id. A flow is uniquely identified by name + external_version. @@ -273,10 +324,10 @@ def flow_exists(name: str, external_version: str) -> Union[int, bool]: def get_flow_id( - model: Optional[Any] = None, - name: Optional[str] = None, - exact_version=True, -) -> Union[int, bool, List[int]]: + model: Any | None = None, + name: str | None = None, + exact_version: bool = True, # noqa: FBT001, FBT002 +) -> int | bool | list[int]: """Retrieves the flow id for a model or a flow name. Provide either a model or a name to this function. Depending on the input, it does @@ -300,18 +351,14 @@ def get_flow_id( exact_version : bool Whether to return the flow id of the exact version or all flow ids where the name of the flow matches. This is only taken into account for a model where a version number - is available. + is available (requires ``model`` to be set). Returns ------- int or bool, List flow id iff exists, ``False`` otherwise, List if ``exact_version is False`` """ - if model is None and name is None: - raise ValueError( - "Need to provide either argument `model` or argument `name`, but both are `None`." 
- ) - elif model is not None and name is not None: + if model is not None and name is not None: raise ValueError("Must provide either argument `model` or argument `name`, but not both.") if model is not None: @@ -323,30 +370,63 @@ def get_flow_id( flow = extension.model_to_flow(model) flow_name = flow.name external_version = flow.external_version - else: + elif name is not None: flow_name = name exact_version = False + external_version = None + else: + raise ValueError( + "Need to provide either argument `model` or argument `name`, but both are `None`." + ) if exact_version: + if external_version is None: + raise ValueError("exact_version should be False if model is None!") return flow_exists(name=flow_name, external_version=external_version) - else: - flows = list_flows(output_format="dataframe") - assert isinstance(flows, pd.DataFrame) # Make mypy happy - flows = flows.query('name == "{}"'.format(flow_name)) - return flows["id"].to_list() + flows = list_flows(output_format="dataframe") + assert isinstance(flows, pd.DataFrame) # Make mypy happy + flows = flows.query(f'name == "{flow_name}"') + return flows["id"].to_list() # type: ignore[no-any-return] + + +@overload +def __list_flows(api_call: str, output_format: Literal["dict"] = "dict") -> dict: + ... + + +@overload +def __list_flows(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: + ... -def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]: + +def __list_flows( + api_call: str, output_format: Literal["dict", "dataframe"] = "dict" +) -> dict | pd.DataFrame: + """Retrieve information about flows from OpenML API + and parse it to a dictionary or a Pandas DataFrame. + + Parameters + ---------- + api_call: str + Retrieves the information about flows. + output_format: str in {"dict", "dataframe"} + The output format. + + Returns + ------- + The flows information in the specified output format. 
+ """ xml_string = openml._api_calls._perform_api_call(api_call, "get") flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",)) # Minimalistic check if the XML is useful - assert type(flows_dict["oml:flows"]["oml:flow"]) == list, type(flows_dict["oml:flows"]) + assert isinstance(flows_dict["oml:flows"]["oml:flow"], list), type(flows_dict["oml:flows"]) assert flows_dict["oml:flows"]["@xmlns:oml"] == "http://openml.org/openml", flows_dict[ "oml:flows" ]["@xmlns:oml"] - flows = dict() + flows = {} for flow_ in flows_dict["oml:flows"]["oml:flow"]: fid = int(flow_["oml:id"]) flow = { @@ -367,27 +447,25 @@ def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.D def _check_flow_for_server_id(flow: OpenMLFlow) -> None: """Raises a ValueError if the flow or any of its subflows has no flow id.""" - # Depth-first search to check if all components were uploaded to the # server before parsing the parameters - stack = list() - stack.append(flow) + stack = [flow] while len(stack) > 0: current = stack.pop() if current.flow_id is None: raise ValueError("Flow %s has no flow_id!" % current.name) - else: - for component in current.components.values(): - stack.append(component) + for component in current.components.values(): + stack.append(component) -def assert_flows_equal( + +def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 flow1: OpenMLFlow, flow2: OpenMLFlow, - ignore_parameter_values_on_older_children: Optional[str] = None, - ignore_parameter_values: bool = False, - ignore_custom_name_if_none: bool = False, - check_description: bool = True, + ignore_parameter_values_on_older_children: str | None = None, + ignore_parameter_values: bool = False, # noqa: FBT001, FBT002 + ignore_custom_name_if_none: bool = False, # noqa: FBT001, FBT002 + check_description: bool = True, # noqa: FBT001, FBT002 ) -> None: """Check equality of two flows. 
@@ -444,11 +522,11 @@ def assert_flows_equal( for name in set(attr1.keys()).union(attr2.keys()): if name not in attr1: raise ValueError( - "Component %s only available in " "argument2, but not in argument1." % name + "Component %s only available in " "argument2, but not in argument1." % name, ) if name not in attr2: raise ValueError( - "Component %s only available in " "argument2, but not in argument1." % name + "Component %s only available in " "argument2, but not in argument1." % name, ) assert_flows_equal( attr1[name], @@ -473,13 +551,16 @@ def assert_flows_equal( raise ValueError( "Flow %s: parameter set of flow " "differs from the parameters stored " - "on the server." % flow1.name + "on the server." % flow1.name, ) if ignore_parameter_values_on_older_children: + assert ( + flow1.upload_date is not None + ), "Flow1 has no upload date that allows us to compare age of children." upload_date_current_flow = dateutil.parser.parse(flow1.upload_date) upload_date_parent_flow = dateutil.parser.parse( - ignore_parameter_values_on_older_children + ignore_parameter_values_on_older_children, ) if upload_date_current_flow < upload_date_parent_flow: continue @@ -506,7 +587,7 @@ def assert_flows_equal( params2 = set(flow2.parameters_meta_info) if params1 != params2: raise ValueError( - "Parameter list in meta info for parameters differ " "in the two flows." 
+ "Parameter list in meta info for parameters differ " "in the two flows.", ) # iterating over the parameter's meta info list for param in params1: @@ -523,18 +604,19 @@ def assert_flows_equal( value2 = flow2.parameters_meta_info[param] if value1 is None or value2 is None: continue - elif value1 != value2: + + if value1 != value2: raise ValueError( - "Flow {}: data type for parameter {} in {} differ " - "as {}\nvs\n{}".format(flow1.name, param, key, value1, value2) + f"Flow {flow1.name}: data type for parameter {param} in {key} differ " + f"as {value1}\nvs\n{value2}", ) # the continue is to avoid the 'attr != attr2' check at end of function continue if attr1 != attr2: raise ValueError( - "Flow %s: values for attribute '%s' differ: " - "'%s'\nvs\n'%s'." % (str(flow1.name), str(key), str(attr1), str(attr2)) + f"Flow {flow1.name!s}: values for attribute '{key!s}' differ: " + f"'{attr1!s}'\nvs\n'{attr2!s}'.", ) @@ -549,7 +631,6 @@ def _create_flow_from_xml(flow_xml: str) -> OpenMLFlow: ------- OpenMLFlow """ - return OpenMLFlow._from_dict(xmltodict.parse(flow_xml)) diff --git a/openml/runs/__init__.py b/openml/runs/__init__.py index 2abbd8f29..6d3dca504 100644 --- a/openml/runs/__init__.py +++ b/openml/runs/__init__.py @@ -1,19 +1,19 @@ # License: BSD 3-Clause -from .run import OpenMLRun -from .trace import OpenMLRunTrace, OpenMLTraceIteration from .functions import ( - run_model_on_task, - run_flow_on_task, + delete_run, get_run, - list_runs, - get_runs, get_run_trace, - run_exists, + get_runs, initialize_model_from_run, initialize_model_from_trace, - delete_run, + list_runs, + run_exists, + run_flow_on_task, + run_model_on_task, ) +from .run import OpenMLRun +from .trace import OpenMLRunTrace, OpenMLTraceIteration __all__ = [ "OpenMLRun", diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 96e031aee..7a082e217 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -1,62 +1,73 @@ # License: BSD 3-Clause +from __future__ import 
annotations -from collections import OrderedDict -import io import itertools -import os import time -from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING, cast # noqa F401 import warnings +from collections import OrderedDict +from pathlib import Path +from typing import TYPE_CHECKING, Any +from typing_extensions import Literal -import sklearn.metrics -import xmltodict import numpy as np import pandas as pd +import sklearn.metrics +import xmltodict from joblib.parallel import Parallel, delayed import openml -import openml.utils import openml._api_calls -from openml.exceptions import PyOpenMLError -from openml.extensions import get_extension_by_model +import openml.utils from openml import config +from openml.exceptions import ( + OpenMLCacheException, + OpenMLRunsExistError, + OpenMLServerException, + PyOpenMLError, +) +from openml.extensions import get_extension_by_model +from openml.flows import OpenMLFlow, flow_exists, get_flow from openml.flows.flow import _copy_server_fields -from ..flows import get_flow, flow_exists, OpenMLFlow -from ..setups import setup_exists, initialize_model -from ..exceptions import OpenMLCacheException, OpenMLServerException, OpenMLRunsExistError -from ..tasks import ( - OpenMLTask, +from openml.setups import initialize_model, setup_exists +from openml.tasks import ( OpenMLClassificationTask, OpenMLClusteringTask, + OpenMLLearningCurveTask, OpenMLRegressionTask, OpenMLSupervisedTask, - OpenMLLearningCurveTask, + OpenMLTask, + TaskType, + get_task, ) + from .run import OpenMLRun from .trace import OpenMLRunTrace -from ..tasks import TaskType, get_task # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: + from openml.config import _Config from openml.extensions.extension_interface import Extension # get_dict is in run.py to avoid circular imports RUNS_CACHE_DIR_NAME = "runs" +ERROR_CODE = 512 -def run_model_on_task( +# TODO(eddiebergman): Could 
potentially overload this but +# it seems very big to do so +def run_model_on_task( # noqa: PLR0913 model: Any, - task: Union[int, str, OpenMLTask], - avoid_duplicate_runs: bool = True, - flow_tags: Optional[List[str]] = None, - seed: Optional[int] = None, - add_local_measures: bool = True, - upload_flow: bool = False, - return_flow: bool = False, - dataset_format: str = "dataframe", - n_jobs: Optional[int] = None, -) -> Union[OpenMLRun, Tuple[OpenMLRun, OpenMLFlow]]: + task: int | str | OpenMLTask, + avoid_duplicate_runs: bool = True, # noqa: FBT001, FBT002 + flow_tags: list[str] | None = None, + seed: int | None = None, + add_local_measures: bool = True, # noqa: FBT001, FBT002 + upload_flow: bool = False, # noqa: FBT001, FBT002 + return_flow: bool = False, # noqa: FBT001, FBT002 + dataset_format: Literal["array", "dataframe"] = "dataframe", + n_jobs: int | None = None, +) -> OpenMLRun | tuple[OpenMLRun, OpenMLFlow]: """Run the model on the dataset defined by the task. Parameters @@ -104,6 +115,8 @@ def run_model_on_task( "Please set your API key in the OpenML configuration file, see" "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial" ".html#authentication for more information on authentication.", + RuntimeWarning, + stacklevel=2, ) # TODO: At some point in the future do not allow for arguments in old order (6-2018). @@ -116,6 +129,7 @@ def run_model_on_task( "will not be supported in the future. 
Please use the " "order (model, task).", DeprecationWarning, + stacklevel=2, ) task, model = model, task @@ -127,11 +141,24 @@ def run_model_on_task( flow = extension.model_to_flow(model) - def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTask: - if isinstance(task, (int, str)): - return get_task(int(task)) - else: - return task + def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask: + """Retrieve an OpenMLTask object from either an integer or string ID, + or directly from an OpenMLTask object. + + Parameters + ---------- + _task : Union[int, str, OpenMLTask] + The task ID or the OpenMLTask object. + + Returns + ------- + OpenMLTask + The OpenMLTask object. + """ + if isinstance(_task, (int, str)): + return get_task(int(_task)) # type: ignore + + return _task task = get_task_and_type_conversion(task) @@ -151,16 +178,16 @@ def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTas return run -def run_flow_on_task( +def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 flow: OpenMLFlow, task: OpenMLTask, - avoid_duplicate_runs: bool = True, - flow_tags: Optional[List[str]] = None, - seed: Optional[int] = None, - add_local_measures: bool = True, - upload_flow: bool = False, - dataset_format: str = "dataframe", - n_jobs: Optional[int] = None, + avoid_duplicate_runs: bool = True, # noqa: FBT002, FBT001 + flow_tags: list[str] | None = None, + seed: int | None = None, + add_local_measures: bool = True, # noqa: FBT001, FBT002 + upload_flow: bool = False, # noqa: FBT001, FBT002 + dataset_format: Literal["array", "dataframe"] = "dataframe", + n_jobs: int | None = None, ) -> OpenMLRun: """Run the model provided by the flow on the dataset defined by task. @@ -217,6 +244,7 @@ def run_flow_on_task( "will not be supported in the future. 
Please use the " "order (model, Flow).", DeprecationWarning, + stacklevel=2, ) task, flow = flow, task @@ -225,6 +253,7 @@ def run_flow_on_task( if flow.model is None: flow.model = flow.extension.flow_to_model(flow) + flow.model = flow.extension.seed_model(flow.model, seed=seed) # We only need to sync with the server right now if we want to upload the flow, @@ -233,17 +262,16 @@ def run_flow_on_task( if upload_flow or avoid_duplicate_runs: flow_id = flow_exists(flow.name, flow.external_version) if isinstance(flow.flow_id, int) and flow_id != flow.flow_id: - if flow_id: + if flow_id is not False: raise PyOpenMLError( "Local flow_id does not match server flow_id: " - "'{}' vs '{}'".format(flow.flow_id, flow_id) - ) - else: - raise PyOpenMLError( - "Flow does not exist on the server, " "but 'flow.flow_id' is not None." + f"'{flow.flow_id}' vs '{flow_id}'", ) + raise PyOpenMLError( + "Flow does not exist on the server, but 'flow.flow_id' is not None." + ) - if upload_flow and not flow_id: + if upload_flow and flow_id is None: flow.publish() flow_id = flow.flow_id elif flow_id: @@ -255,14 +283,13 @@ def run_flow_on_task( ids = run_exists(task.task_id, setup_id) if ids: error_message = ( - "One or more runs of this setup were " "already performed on the task." + "One or more runs of this setup were already performed on the task." ) raise OpenMLRunsExistError(ids, error_message) else: # Flow does not exist on server and we do not want to upload it. # No sync with the server happens. flow_id = None - pass dataset = task.get_dataset() @@ -272,7 +299,9 @@ def run_flow_on_task( if flow.extension.check_if_model_fitted(flow.model): warnings.warn( "The model is already fitted!" - " This might cause inconsistency in comparison of results." 
+ " This might cause inconsistency in comparison of results.", + RuntimeWarning, + stacklevel=2, ) # execute the run @@ -315,9 +344,9 @@ def run_flow_on_task( run.fold_evaluations = fold_evaluations if flow_id: - message = "Executed Task {} with Flow id:{}".format(task.task_id, run.flow_id) + message = f"Executed Task {task.task_id} with Flow id:{run.flow_id}" else: - message = "Executed Task {} on local Flow with name {}.".format(task.task_id, flow.name) + message = f"Executed Task {task.task_id} on local Flow with name {flow.name}." config.logger.info(message) return run @@ -336,8 +365,7 @@ def get_run_trace(run_id: int) -> OpenMLRunTrace: openml.runs.OpenMLTrace """ trace_xml = openml._api_calls._perform_api_call("run/trace/%d" % run_id, "get") - run_trace = OpenMLRunTrace.trace_from_xml(trace_xml) - return run_trace + return OpenMLRunTrace.trace_from_xml(trace_xml) def initialize_model_from_run(run_id: int) -> Any: @@ -355,6 +383,9 @@ def initialize_model_from_run(run_id: int) -> Any: model """ run = get_run(run_id) + # TODO(eddiebergman): I imagine this is None if it's not published, + # might need to raise an explicit error for that + assert run.setup_id is not None return initialize_model(run.setup_id) @@ -362,7 +393,7 @@ def initialize_model_from_trace( run_id: int, repeat: int, fold: int, - iteration: Optional[int] = None, + iteration: int | None = None, ) -> Any: """ Initialize a model based on the parameters that were set @@ -392,6 +423,10 @@ def initialize_model_from_trace( model """ run = get_run(run_id) + # TODO(eddiebergman): I imagine this is None if it's not published, + # might need to raise an explicit error for that + assert run.flow_id is not None + flow = get_flow(run.flow_id) run_trace = get_run_trace(run_id) @@ -404,11 +439,10 @@ def initialize_model_from_trace( current = run_trace.trace_iterations[(repeat, fold, iteration)] search_model = initialize_model_from_run(run_id) - model = 
flow.extension.instantiate_model_from_hpo_class(search_model, current) - return model + return flow.extension.instantiate_model_from_hpo_class(search_model, current) -def run_exists(task_id: int, setup_id: int) -> Set[int]: +def run_exists(task_id: int, setup_id: int) -> set[int]: """Checks whether a task/setup combination is already present on the server. @@ -428,31 +462,58 @@ def run_exists(task_id: int, setup_id: int) -> Set[int]: return set() try: - result = cast( - pd.DataFrame, list_runs(task=[task_id], setup=[setup_id], output_format="dataframe") - ) + result = list_runs(task=[task_id], setup=[setup_id], output_format="dataframe") + assert isinstance(result, pd.DataFrame) # TODO(eddiebergman): Remove once #1299 return set() if result.empty else set(result["run_id"]) except OpenMLServerException as exception: - # error code 512 implies no results. The run does not exist yet - assert exception.code == 512 + # error code implies no results. The run does not exist yet + if exception.code != ERROR_CODE: + raise exception return set() -def _run_task_get_arffcontent( +def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901 + *, model: Any, task: OpenMLTask, - extension: "Extension", + extension: Extension, add_local_measures: bool, - dataset_format: str, - n_jobs: Optional[int] = None, -) -> Tuple[ - List[List], - Optional[OpenMLRunTrace], - "OrderedDict[str, OrderedDict]", - "OrderedDict[str, OrderedDict]", + dataset_format: Literal["array", "dataframe"], + n_jobs: int | None = None, +) -> tuple[ + list[list], + OpenMLRunTrace | None, + OrderedDict[str, OrderedDict], + OrderedDict[str, OrderedDict], ]: - arff_datacontent = [] # type: List[List] - traces = [] # type: List[OpenMLRunTrace] + """Runs the hyperparameter optimization on the given task + and returns the arfftrace content. + + Parameters + ---------- + model : Any + The model that is to be evalauted. + task : OpenMLTask + The OpenMLTask to evaluate. 
+ extension : Extension + The OpenML extension object. + add_local_measures : bool + Whether to compute additional local evaluation measures. + dataset_format : str + The format in which to download the dataset. + n_jobs : int + Number of jobs to run in parallel. + If None, use 1 core by default. If -1, use all available cores. + + Returns + ------- + Tuple[List[List], Optional[OpenMLRunTrace], + OrderedDict[str, OrderedDict], OrderedDict[str, OrderedDict]] + A tuple containing the arfftrace content, + the OpenML run trace, the global and local evaluation measures. + """ + arff_datacontent = [] # type: list[list] + traces = [] # type: list[OpenMLRunTrace] # stores fold-based evaluation measures. In case of a sample based task, # this information is multiple times overwritten, but due to the ordering # of tne loops, eventually it contains the information based on the full @@ -484,7 +545,18 @@ def _run_task_get_arffcontent( # Execute runs in parallel # assuming the same number of tasks as workers (n_jobs), the total compute time for this # statement will be similar to the slowest run - job_rvals = Parallel(verbose=0, n_jobs=n_jobs)( + # TODO(eddiebergman): Simplify this + job_rvals: list[ + tuple[ + np.ndarray, + pd.DataFrame | None, + np.ndarray, + pd.DataFrame | None, + OpenMLRunTrace | None, + OrderedDict[str, float], + ], + ] + job_rvals = Parallel(verbose=0, n_jobs=n_jobs)( # type: ignore delayed(_run_task_get_arffcontent_parallel_helper)( extension=extension, fold_no=fold_no, @@ -495,22 +567,32 @@ def _run_task_get_arffcontent( dataset_format=dataset_format, configuration=_config, ) - for n_fit, rep_no, fold_no, sample_no in jobs + for _n_fit, rep_no, fold_no, sample_no in jobs ) # job_rvals contain the output of all the runs with one-to-one correspondence with `jobs` for n_fit, rep_no, fold_no, sample_no in jobs: - pred_y, proba_y, test_indices, test_y, trace, user_defined_measures_fold = job_rvals[ + pred_y, proba_y, test_indices, test_y, inner_trace, 
user_defined_measures_fold = job_rvals[ n_fit - 1 ] - if trace is not None: - traces.append(trace) + + if inner_trace is not None: + traces.append(inner_trace) # add client-side calculated metrics. These is used on the server as # consistency check, only useful for supervised tasks - def _calculate_local_measure(sklearn_fn, openml_name): - user_defined_measures_fold[openml_name] = sklearn_fn(test_y, pred_y) + def _calculate_local_measure( # type: ignore + sklearn_fn, + openml_name, + _test_y=test_y, + _pred_y=pred_y, + _user_defined_measures_fold=user_defined_measures_fold, + ): + _user_defined_measures_fold[openml_name] = sklearn_fn(_test_y, _pred_y) if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): + assert test_y is not None + assert proba_y is not None + for i, tst_idx in enumerate(test_indices): if task.class_labels is not None: prediction = ( @@ -554,6 +636,7 @@ def _calculate_local_measure(sklearn_fn, openml_name): ) elif isinstance(task, OpenMLRegressionTask): + assert test_y is not None for i, _ in enumerate(test_indices): truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i] arff_line = format_prediction( @@ -601,15 +684,14 @@ def _calculate_local_measure(sklearn_fn, openml_name): sample_no ] = user_defined_measures_fold[measure] + trace: OpenMLRunTrace | None = None if len(traces) > 0: - if len(traces) != n_fit: + if len(traces) != len(jobs): raise ValueError( - "Did not find enough traces (expected {}, found {})".format(n_fit, len(traces)) + f"Did not find enough traces (expected {len(jobs)}, found {len(traces)})", ) - else: - trace = OpenMLRunTrace.merge_traces(traces) - else: - trace = None + + trace = OpenMLRunTrace.merge_traces(traces) return ( arff_datacontent, @@ -619,54 +701,88 @@ def _calculate_local_measure(sklearn_fn, openml_name): ) -def _run_task_get_arffcontent_parallel_helper( - extension: "Extension", +def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 + extension: Extension, fold_no: 
int, model: Any, rep_no: int, sample_no: int, task: OpenMLTask, - dataset_format: str, - configuration: Optional[Dict] = None, -) -> Tuple[ + dataset_format: Literal["array", "dataframe"], + configuration: _Config | None = None, +) -> tuple[ np.ndarray, - Optional[pd.DataFrame], + pd.DataFrame | None, np.ndarray, - Optional[pd.DataFrame], - Optional[OpenMLRunTrace], - "OrderedDict[str, float]", + pd.DataFrame | None, + OpenMLRunTrace | None, + OrderedDict[str, float], ]: + """Helper function that runs a single model on a single task fold sample. + + Parameters + ---------- + extension : Extension + An OpenML extension instance. + fold_no : int + The fold number to be run. + model : Any + The model that is to be evaluated. + rep_no : int + Repetition number to be run. + sample_no : int + Sample number to be run. + task : OpenMLTask + The task object from OpenML. + dataset_format : str + The dataset format to be used. + configuration : _Config + Hyperparameters to configure the model. + + Returns + ------- + Tuple[np.ndarray, Optional[pd.DataFrame], np.ndarray, Optional[pd.DataFrame], + Optional[OpenMLRunTrace], OrderedDict[str, float]] + A tuple containing the predictions, probability estimates (if applicable), + actual target values, actual target value probabilities (if applicable), + the trace object of the OpenML run (if applicable), + and a dictionary of local measures for this particular fold. 
+ """ # Sets up the OpenML instantiated in the child process to match that of the parent's # if configuration=None, loads the default config._setup(configuration) train_indices, test_indices = task.get_train_test_split_indices( - repeat=rep_no, fold=fold_no, sample=sample_no + repeat=rep_no, + fold=fold_no, + sample=sample_no, ) if isinstance(task, OpenMLSupervisedTask): x, y = task.get_X_and_y(dataset_format=dataset_format) - if dataset_format == "dataframe": + if isinstance(x, pd.DataFrame): + assert isinstance(y, (pd.Series, pd.DataFrame)) train_x = x.iloc[train_indices] train_y = y.iloc[train_indices] test_x = x.iloc[test_indices] test_y = y.iloc[test_indices] else: - train_x = x[train_indices] + # TODO(eddiebergman): Complains spmatrix doesn't support __getitem__ for typing + assert y is not None + train_x = x[train_indices] # type: ignore train_y = y[train_indices] - test_x = x[test_indices] + test_x = x[test_indices] # type: ignore test_y = y[test_indices] elif isinstance(task, OpenMLClusteringTask): x = task.get_X(dataset_format=dataset_format) - if dataset_format == "dataframe": - train_x = x.iloc[train_indices] - else: - train_x = x[train_indices] + # TODO(eddiebergman): Complains spmatrix doesn't support __getitem__ for typing + train_x = x.iloc[train_indices] if isinstance(x, pd.DataFrame) else x[train_indices] # type: ignore train_y = None test_x = None test_y = None else: raise NotImplementedError(task.task_type) + config.logger.info( "Going to run model {} on dataset {} for repeat {} fold {} sample {}".format( str(model), @@ -674,7 +790,7 @@ def _run_task_get_arffcontent_parallel_helper( rep_no, fold_no, sample_no, - ) + ), ) ( pred_y, @@ -685,15 +801,16 @@ def _run_task_get_arffcontent_parallel_helper( model=model, task=task, X_train=train_x, - y_train=train_y, + # TODO(eddiebergman): Likely should not be ignored + y_train=train_y, # type: ignore rep_no=rep_no, fold_no=fold_no, X_test=test_x, ) - return pred_y, proba_y, test_indices, test_y, trace, 
user_defined_measures_fold + return pred_y, proba_y, test_indices, test_y, trace, user_defined_measures_fold # type: ignore -def get_runs(run_ids): +def get_runs(run_ids: list[int]) -> list[OpenMLRun]: """Gets all runs in run_ids list. Parameters @@ -705,7 +822,6 @@ def get_runs(run_ids): runs : list of OpenMLRun List of runs corresponding to IDs, fetched from the server. """ - runs = [] for run_id in run_ids: runs.append(get_run(run_id)) @@ -713,7 +829,7 @@ def get_runs(run_ids): @openml.utils.thread_safe_if_oslo_installed -def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: +def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: # noqa: FBT002, FBT001 """Gets run corresponding to run_id. Parameters @@ -731,29 +847,26 @@ def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: run : OpenMLRun Run corresponding to ID, fetched from the server. """ - run_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id) - run_file = os.path.join(run_dir, "description.xml") + run_dir = Path(openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id)) + run_file = run_dir / "description.xml" - if not os.path.exists(run_dir): - os.makedirs(run_dir) + run_dir.mkdir(parents=True, exist_ok=True) try: if not ignore_cache: return _get_cached_run(run_id) - else: - raise OpenMLCacheException(message="dummy") + + raise OpenMLCacheException(message="dummy") except OpenMLCacheException: run_xml = openml._api_calls._perform_api_call("run/%d" % run_id, "get") - with io.open(run_file, "w", encoding="utf8") as fh: + with run_file.open("w", encoding="utf8") as fh: fh.write(run_xml) - run = _create_run_from_xml(run_xml) + return _create_run_from_xml(run_xml) - return run - -def _create_run_from_xml(xml, from_server=True): +def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun: # noqa: PLR0915, PLR0912, C901, , FBT001, FBT002FBT """Create a run object from xml returned from server. 
Parameters @@ -771,7 +884,7 @@ def _create_run_from_xml(xml, from_server=True): New run object representing run_xml. """ - def obtain_field(xml_obj, fieldname, from_server, cast=None): + def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore # this function can be used to check whether a field is present in an # object. if it is not present, either returns None or throws an error # (this is usually done if the xml comes from the server) @@ -779,10 +892,11 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): if cast is not None: return cast(xml_obj[fieldname]) return xml_obj[fieldname] - elif not from_server: + + if not from_server: return None - else: - raise AttributeError("Run XML does not contain required (server) " "field: ", fieldname) + + raise AttributeError("Run XML does not contain required (server) " "field: ", fieldname) run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[ "oml:run" @@ -794,10 +908,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): task_type = obtain_field(run, "oml:task_type", from_server) # even with the server requirement this field may be empty. - if "oml:task_evaluation_measure" in run: - task_evaluation_measure = run["oml:task_evaluation_measure"] - else: - task_evaluation_measure = None + task_evaluation_measure = run.get("oml:task_evaluation_measure", None) if not from_server and run["oml:flow_id"] is None: # This can happen for a locally stored run of which the flow is not yet published. 
@@ -811,9 +922,10 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): if "oml:parameter_setting" in run: obtained_parameter_settings = run["oml:parameter_setting"] for parameter_dict in obtained_parameter_settings: - current_parameter = OrderedDict() - current_parameter["oml:name"] = parameter_dict["oml:name"] - current_parameter["oml:value"] = parameter_dict["oml:value"] + current_parameter = { + "oml:name": parameter_dict["oml:name"], + "oml:value": parameter_dict["oml:value"], + } if "oml:component" in parameter_dict: current_parameter["oml:component"] = parameter_dict["oml:component"] parameters.append(current_parameter) @@ -834,15 +946,14 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): t = openml.tasks.get_task(task_id, download_data=False) if not hasattr(t, "dataset_id"): raise ValueError( - "Unable to fetch dataset_id from the task({}) " - "linked to run({})".format(task_id, run_id) + f"Unable to fetch dataset_id from the task({task_id}) linked to run({run_id})", ) dataset_id = t.dataset_id - files = OrderedDict() - evaluations = OrderedDict() - fold_evaluations = OrderedDict() - sample_evaluations = OrderedDict() + files: dict[str, int] = {} + evaluations: dict[str, float | Any] = {} + fold_evaluations: dict[str, dict[int, dict[int, float | Any]]] = {} + sample_evaluations: dict[str, dict[int, dict[int, dict[int, float | Any]]]] = {} if "oml:output_data" not in run: if from_server: raise ValueError("Run does not contain output_data " "(OpenML server error?)") @@ -868,7 +979,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): else: raise ValueError( 'Could not find keys "value" or ' - '"array_data" in %s' % str(evaluation_dict.keys()) + '"array_data" in %s' % str(evaluation_dict.keys()), ) if ( "@repeat" in evaluation_dict @@ -879,19 +990,19 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): fold = int(evaluation_dict["@fold"]) sample = int(evaluation_dict["@sample"]) if key not in 
sample_evaluations: - sample_evaluations[key] = OrderedDict() + sample_evaluations[key] = {} if repeat not in sample_evaluations[key]: - sample_evaluations[key][repeat] = OrderedDict() + sample_evaluations[key][repeat] = {} if fold not in sample_evaluations[key][repeat]: - sample_evaluations[key][repeat][fold] = OrderedDict() + sample_evaluations[key][repeat][fold] = {} sample_evaluations[key][repeat][fold][sample] = value elif "@repeat" in evaluation_dict and "@fold" in evaluation_dict: repeat = int(evaluation_dict["@repeat"]) fold = int(evaluation_dict["@fold"]) if key not in fold_evaluations: - fold_evaluations[key] = OrderedDict() + fold_evaluations[key] = {} if repeat not in fold_evaluations[key]: - fold_evaluations[key][repeat] = OrderedDict() + fold_evaluations[key][repeat] = {} fold_evaluations[key][repeat][fold] = value else: evaluations[key] = value @@ -903,12 +1014,12 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): task = openml.tasks.get_task(task_id) if task.task_type_id == TaskType.SUBGROUP_DISCOVERY: raise NotImplementedError("Subgroup discovery tasks are not yet supported.") - else: - # JvR: actually, I am not sure whether this error should be raised. - # a run can consist without predictions. But for now let's keep it - # Matthias: yes, it should stay as long as we do not really handle - # this stuff - raise ValueError("No prediction files for run %d in run " "description XML" % run_id) + + # JvR: actually, I am not sure whether this error should be raised. + # a run can consist without predictions. 
But for now let's keep it + # Matthias: yes, it should stay as long as we do not really handle + # this stuff + raise ValueError("No prediction files for run %d in run description XML" % run_id) tags = openml.utils.extract_xml_tags("oml:tag", run) @@ -936,36 +1047,33 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): ) -def _get_cached_run(run_id): +def _get_cached_run(run_id: int) -> OpenMLRun: """Load a run from the cache.""" - run_cache_dir = openml.utils._create_cache_directory_for_id( - RUNS_CACHE_DIR_NAME, - run_id, - ) + run_cache_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id) + run_file = run_cache_dir / "description.xml" try: - run_file = os.path.join(run_cache_dir, "description.xml") - with io.open(run_file, encoding="utf8") as fh: - run = _create_run_from_xml(xml=fh.read()) - return run - - except (OSError, IOError): - raise OpenMLCacheException("Run file for run id %d not " "cached" % run_id) - - -def list_runs( - offset: Optional[int] = None, - size: Optional[int] = None, - id: Optional[List] = None, - task: Optional[List[int]] = None, - setup: Optional[List] = None, - flow: Optional[List] = None, - uploader: Optional[List] = None, - tag: Optional[str] = None, - study: Optional[int] = None, - display_errors: bool = False, - output_format: str = "dict", - **kwargs, -) -> Union[Dict, pd.DataFrame]: + with run_file.open(encoding="utf8") as fh: + return _create_run_from_xml(xml=fh.read()) + except OSError as e: + raise OpenMLCacheException(f"Run file for run id {run_id} not cached") from e + + +# TODO(eddiebergman): Could overload, likely too large an annoying to do +# nvm, will be deprecated in 0.15 +def list_runs( # noqa: PLR0913 + offset: int | None = None, + size: int | None = None, + id: list | None = None, # noqa: A002 + task: list[int] | None = None, + setup: list | None = None, + flow: list | None = None, + uploader: list | None = None, + tag: str | None = None, + study: int | None = None, + 
display_errors: bool = False, # noqa: FBT001, FBT002 + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ List all runs matching all of the given filters. (Supports large amount of results) @@ -1008,9 +1116,8 @@ def list_runs( dict of dicts, or dataframe """ if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." - ) + raise ValueError("Invalid output format selected. Only 'dict' or 'dataframe' applicable.") + # TODO: [0.15] if output_format == "dict": msg = ( @@ -1020,6 +1127,7 @@ def list_runs( ) warnings.warn(msg, category=FutureWarning, stacklevel=2) + # TODO(eddiebergman): Do we really need this runtime type validation? if id is not None and (not isinstance(id, list)): raise TypeError("id must be of type list.") if task is not None and (not isinstance(task, list)): @@ -1031,8 +1139,8 @@ def list_runs( if uploader is not None and (not isinstance(uploader, list)): raise TypeError("uploader must be of type list.") - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_runs, offset=offset, size=size, @@ -1048,17 +1156,17 @@ def list_runs( ) -def _list_runs( - id: Optional[List] = None, - task: Optional[List] = None, - setup: Optional[List] = None, - flow: Optional[List] = None, - uploader: Optional[List] = None, - study: Optional[int] = None, - display_errors: bool = False, - output_format: str = "dict", - **kwargs, -) -> Union[Dict, pd.DataFrame]: +def _list_runs( # noqa: PLR0913 + id: list | None = None, # noqa: A002 + task: list | None = None, + setup: list | None = None, + flow: list | None = None, + uploader: list | None = None, + study: int | None = None, + display_errors: bool = False, # noqa: FBT002, FBT001 + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | 
pd.DataFrame: """ Perform API call `/run/list/{filters}' ` @@ -1099,11 +1207,10 @@ def _list_runs( dict, or dataframe List of found runs. """ - api_call = "run/list" if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" if id is not None: api_call += "/run/%s" % ",".join([str(int(i)) for i in id]) if task is not None: @@ -1121,40 +1228,43 @@ def _list_runs( return __list_runs(api_call=api_call, output_format=output_format) -def __list_runs(api_call, output_format="dict"): +def __list_runs( + api_call: str, output_format: Literal["dict", "dataframe"] = "dict" +) -> dict | pd.DataFrame: """Helper function to parse API calls which are lists of runs""" xml_string = openml._api_calls._perform_api_call(api_call, "get") runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",)) # Minimalistic check if the XML is useful if "oml:runs" not in runs_dict: - raise ValueError('Error in return XML, does not contain "oml:runs": %s' % str(runs_dict)) - elif "@xmlns:oml" not in runs_dict["oml:runs"]: + raise ValueError(f'Error in return XML, does not contain "oml:runs": {runs_dict}') + + if "@xmlns:oml" not in runs_dict["oml:runs"]: raise ValueError( - "Error in return XML, does not contain " '"oml:runs"/@xmlns:oml: %s' % str(runs_dict) + f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {runs_dict}' ) - elif runs_dict["oml:runs"]["@xmlns:oml"] != "http://openml.org/openml": + + if runs_dict["oml:runs"]["@xmlns:oml"] != "http://openml.org/openml": raise ValueError( "Error in return XML, value of " '"oml:runs"/@xmlns:oml is not ' - '"http://openml.org/openml": %s' % str(runs_dict) + f'"http://openml.org/openml": {runs_dict}', ) - assert type(runs_dict["oml:runs"]["oml:run"]) == list, type(runs_dict["oml:runs"]) - - runs = OrderedDict() - for run_ in runs_dict["oml:runs"]["oml:run"]: - run_id = int(run_["oml:run_id"]) - run = { - "run_id": run_id, - "task_id": 
int(run_["oml:task_id"]), - "setup_id": int(run_["oml:setup_id"]), - "flow_id": int(run_["oml:flow_id"]), - "uploader": int(run_["oml:uploader"]), - "task_type": TaskType(int(run_["oml:task_type_id"])), - "upload_time": str(run_["oml:upload_time"]), - "error_message": str((run_["oml:error_message"]) or ""), + assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type(runs_dict["oml:runs"]) + + runs = { + int(r["oml:run_id"]): { + "run_id": int(r["oml:run_id"]), + "task_id": int(r["oml:task_id"]), + "setup_id": int(r["oml:setup_id"]), + "flow_id": int(r["oml:flow_id"]), + "uploader": int(r["oml:uploader"]), + "task_type": TaskType(int(r["oml:task_type_id"])), + "upload_time": str(r["oml:upload_time"]), + "error_message": str((r["oml:error_message"]) or ""), } - runs[run_id] = run + for r in runs_dict["oml:runs"]["oml:run"] + } if output_format == "dataframe": runs = pd.DataFrame.from_dict(runs, orient="index") @@ -1162,16 +1272,16 @@ def __list_runs(api_call, output_format="dict"): return runs -def format_prediction( +def format_prediction( # noqa: PLR0913 task: OpenMLSupervisedTask, repeat: int, fold: int, index: int, - prediction: Union[str, int, float], - truth: Union[str, int, float], - sample: Optional[int] = None, - proba: Optional[Dict[str, float]] = None, -) -> List[Union[str, int, float]]: + prediction: str | int | float, + truth: str | int | float, + sample: int | None = None, + proba: dict[str, float] | None = None, +) -> list[str | int | float]: """Format the predictions in the specific order as required for the run results. 
Parameters @@ -1216,14 +1326,15 @@ def format_prediction( if sample is None: if isinstance(task, OpenMLLearningCurveTask): raise ValueError("`sample` can not be none for LearningCurveTask") - else: - sample = 0 + + sample = 0 probabilities = [proba[c] for c in task.class_labels] return [repeat, fold, sample, index, prediction, truth, *probabilities] - elif isinstance(task, OpenMLRegressionTask): + + if isinstance(task, OpenMLRegressionTask): return [repeat, fold, index, prediction, truth] - else: - raise NotImplementedError(f"Formatting for {type(task)} is not supported.") + + raise NotImplementedError(f"Formatting for {type(task)} is not supported.") def delete_run(run_id: int) -> bool: diff --git a/openml/runs/run.py b/openml/runs/run.py index 5528c8a67..766f8c97f 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -1,10 +1,16 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import OrderedDict import pickle import time -from typing import Any, IO, TextIO, List, Union, Tuple, Optional, Dict # noqa F401 -import os +from collections import OrderedDict +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Sequence, +) import arff import numpy as np @@ -13,17 +19,21 @@ import openml import openml._api_calls from openml.base import OpenMLBase -from ..exceptions import PyOpenMLError -from ..flows import get_flow -from ..tasks import ( - get_task, - TaskType, +from openml.exceptions import PyOpenMLError +from openml.flows import OpenMLFlow, get_flow +from openml.tasks import ( OpenMLClassificationTask, - OpenMLLearningCurveTask, OpenMLClusteringTask, + OpenMLLearningCurveTask, OpenMLRegressionTask, + OpenMLTask, + TaskType, + get_task, ) +if TYPE_CHECKING: + from openml.runs.trace import OpenMLRunTrace + class OpenMLRun(OpenMLBase): """OpenML Run: result of running a model on an OpenML dataset. @@ -38,7 +48,7 @@ class OpenMLRun(OpenMLBase): The ID of the OpenML dataset used for the run. 
setup_string: str The setup string of the run. - output_files: Dict[str, str] + output_files: Dict[str, int] Specifies where each related file can be found. setup_id: int An integer representing the ID of the setup used for the run. @@ -66,7 +76,7 @@ class OpenMLRun(OpenMLBase): The evaluation measure used for the task. flow_name: str The name of the OpenML flow associated with the run. - parameter_settings: List[OrderedDict] + parameter_settings: list[OrderedDict] Representing the parameter settings used for the run. predictions_url: str The URL of the predictions file. @@ -85,33 +95,33 @@ class OpenMLRun(OpenMLBase): Description of the run stored in the run meta-data. """ - def __init__( + def __init__( # noqa: PLR0913 self, - task_id, - flow_id, - dataset_id, - setup_string=None, - output_files=None, - setup_id=None, - tags=None, - uploader=None, - uploader_name=None, - evaluations=None, - fold_evaluations=None, - sample_evaluations=None, - data_content=None, - trace=None, - model=None, - task_type=None, - task_evaluation_measure=None, - flow_name=None, - parameter_settings=None, - predictions_url=None, - task=None, - flow=None, - run_id=None, - description_text=None, - run_details=None, + task_id: int, + flow_id: int | None, + dataset_id: int | None, + setup_string: str | None = None, + output_files: dict[str, int] | None = None, + setup_id: int | None = None, + tags: list[str] | None = None, + uploader: int | None = None, + uploader_name: str | None = None, + evaluations: dict | None = None, + fold_evaluations: dict | None = None, + sample_evaluations: dict | None = None, + data_content: list[list] | None = None, + trace: OpenMLRunTrace | None = None, + model: object | None = None, + task_type: str | None = None, + task_evaluation_measure: str | None = None, + flow_name: str | None = None, + parameter_settings: list[dict[str, Any]] | None = None, + predictions_url: str | None = None, + task: OpenMLTask | None = None, + flow: OpenMLFlow | None = None, + run_id: 
int | None = None, + description_text: str | None = None, + run_details: str | None = None, ): self.uploader = uploader self.uploader_name = uploader_name @@ -153,12 +163,14 @@ def predictions(self) -> pd.DataFrame: else: raise RuntimeError("Run has no predictions.") self._predictions = pd.DataFrame( - arff_dict["data"], columns=[name for name, _ in arff_dict["attributes"]] + arff_dict["data"], + columns=[name for name, _ in arff_dict["attributes"]], ) return self._predictions @property - def id(self) -> Optional[int]: + def id(self) -> int | None: + """The ID of the run, None if not uploaded to the server yet.""" return self.run_id def _evaluation_summary(self, metric: str) -> str: @@ -181,15 +193,17 @@ def _evaluation_summary(self, metric: str) -> str: A formatted string that displays the metric's evaluation summary. The summary consists of the mean and std. """ + if self.fold_evaluations is None: + raise ValueError("No fold evaluations available.") fold_score_lists = self.fold_evaluations[metric].values() # Get the mean and std over all repetitions rep_means = [np.mean(list(x.values())) for x in fold_score_lists] rep_stds = [np.std(list(x.values())) for x in fold_score_lists] - return "{:.4f} +- {:.4f}".format(np.mean(rep_means), np.mean(rep_stds)) + return f"{np.mean(rep_means):.4f} +- {np.mean(rep_stds):.4f}" - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: """Collect all information to display in the __repr__ body.""" # Set up fields fields = { @@ -201,20 +215,26 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: "Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id), "Flow ID": self.flow_id, "Flow Name": self.flow_name, - "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), + "Flow URL": ( + openml.flows.OpenMLFlow.url_for_id(self.flow_id) + if self.flow_id is not None + else None + ), "Setup ID": 
self.setup_id, "Setup String": self.setup_string, "Dataset ID": self.dataset_id, - "Dataset URL": openml.datasets.OpenMLDataset.url_for_id(self.dataset_id), + "Dataset URL": ( + openml.datasets.OpenMLDataset.url_for_id(self.dataset_id) + if self.dataset_id is not None + else None + ), } # determines the order of the initial fields in which the information will be printed order = ["Uploader Name", "Uploader Profile", "Metric", "Result"] if self.uploader is not None: - fields["Uploader Profile"] = "{}/u/{}".format( - openml.config.get_server_base_url(), self.uploader - ) + fields["Uploader Profile"] = f"{openml.config.get_server_base_url()}/u/{self.uploader}" if self.run_id is not None: fields["Run URL"] = self.openml_url if self.evaluations is not None and self.task_evaluation_measure in self.evaluations: @@ -223,13 +243,11 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: # -- Add locally computed summary values if possible if "predictive_accuracy" in self.fold_evaluations: # OpenMLClassificationTask; OpenMLLearningCurveTask - # default: predictive_accuracy result_field = "Local Result - Accuracy (+- STD)" fields[result_field] = self._evaluation_summary("predictive_accuracy") order.append(result_field) elif "mean_absolute_error" in self.fold_evaluations: # OpenMLRegressionTask - # default: mean_absolute_error result_field = "Local Result - MAE (+- STD)" fields[result_field] = self._evaluation_summary("mean_absolute_error") order.append(result_field) @@ -255,10 +273,14 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: "Dataset ID", "Dataset URL", ] - return [(key, fields[key]) for key in order if key in fields] + return [ + (key, "None" if fields[key] is None else fields[key]) # type: ignore + for key in order + if key in fields + ] @classmethod - def from_filesystem(cls, directory: str, expect_model: bool = True) -> "OpenMLRun": + def from_filesystem(cls, directory: str | Path, expect_model: bool = 
True) -> OpenMLRun: # noqa: FBT001, FBT002 """ The inverse of the to_filesystem method. Instantiates an OpenMLRun object based on files stored on the file system. @@ -279,26 +301,26 @@ def from_filesystem(cls, directory: str, expect_model: bool = True) -> "OpenMLRu run : OpenMLRun the re-instantiated run object """ - # Avoiding cyclic imports import openml.runs.functions - if not os.path.isdir(directory): + directory = Path(directory) + if not directory.is_dir(): raise ValueError("Could not find folder") - description_path = os.path.join(directory, "description.xml") - predictions_path = os.path.join(directory, "predictions.arff") - trace_path = os.path.join(directory, "trace.arff") - model_path = os.path.join(directory, "model.pkl") + description_path = directory / "description.xml" + predictions_path = directory / "predictions.arff" + trace_path = directory / "trace.arff" + model_path = directory / "model.pkl" - if not os.path.isfile(description_path): + if not description_path.is_file(): raise ValueError("Could not find description.xml") - if not os.path.isfile(predictions_path): + if not predictions_path.is_file(): raise ValueError("Could not find predictions.arff") - if not os.path.isfile(model_path) and expect_model: + if (not model_path.is_file()) and expect_model: raise ValueError("Could not find model.pkl") - with open(description_path, "r") as fht: + with description_path.open() as fht: xml_string = fht.read() run = openml.runs.functions._create_run_from_xml(xml_string, from_server=False) @@ -307,25 +329,25 @@ def from_filesystem(cls, directory: str, expect_model: bool = True) -> "OpenMLRu run.flow = flow run.flow_name = flow.name - with open(predictions_path, "r") as fht: + with predictions_path.open() as fht: predictions = arff.load(fht) run.data_content = predictions["data"] - if os.path.isfile(model_path): + if model_path.is_file(): # note that it will load the model if the file exists, even if # expect_model is False - with open(model_path, "rb") as 
fhb: - run.model = pickle.load(fhb) + with model_path.open("rb") as fhb: + run.model = pickle.load(fhb) # noqa: S301 - if os.path.isfile(trace_path): + if trace_path.is_file(): run.trace = openml.runs.OpenMLRunTrace._from_filesystem(trace_path) return run def to_filesystem( self, - directory: str, - store_model: bool = True, + directory: str | Path, + store_model: bool = True, # noqa: FBT001, FBT002 ) -> None: """ The inverse of the from_filesystem method. Serializes a run @@ -344,32 +366,31 @@ def to_filesystem( """ if self.data_content is None or self.model is None: raise ValueError("Run should have been executed (and contain " "model / predictions)") + directory = Path(directory) + directory.mkdir(exist_ok=True, parents=True) - os.makedirs(directory, exist_ok=True) - if not os.listdir(directory) == []: - raise ValueError( - "Output directory {} should be empty".format(os.path.abspath(directory)) - ) + if any(directory.iterdir()): + raise ValueError(f"Output directory {directory.expanduser().resolve()} should be empty") run_xml = self._to_xml() predictions_arff = arff.dumps(self._generate_arff_dict()) # It seems like typing does not allow to define the same variable multiple times - with open(os.path.join(directory, "description.xml"), "w") as fh: # type: TextIO + with (directory / "description.xml").open("w") as fh: fh.write(run_xml) - with open(os.path.join(directory, "predictions.arff"), "w") as fh: + with (directory / "predictions.arff").open("w") as fh: fh.write(predictions_arff) if store_model: - with open(os.path.join(directory, "model.pkl"), "wb") as fh_b: # type: IO[bytes] + with (directory / "model.pkl").open("wb") as fh_b: pickle.dump(self.model, fh_b) - if self.flow_id is None: + if self.flow_id is None and self.flow is not None: self.flow.to_filesystem(directory) if self.trace is not None: self.trace._to_filesystem(directory) - def _generate_arff_dict(self) -> "OrderedDict[str, Any]": + def _generate_arff_dict(self) -> OrderedDict[str, Any]: 
"""Generates the arff dictionary for uploading predictions to the server. @@ -386,6 +407,7 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]": if self.data_content is None: raise ValueError("Run has not been executed.") if self.flow is None: + assert self.flow_id is not None, "Run has no associated flow id!" self.flow = get_flow(self.flow_id) if self.description_text is None: @@ -395,7 +417,7 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]": arff_dict = OrderedDict() # type: 'OrderedDict[str, Any]' arff_dict["data"] = self.data_content arff_dict["description"] = self.description_text - arff_dict["relation"] = "openml_task_{}_predictions".format(task.task_id) + arff_dict["relation"] = f"openml_task_{task.task_id}_predictions" if isinstance(task, OpenMLLearningCurveTask): class_labels = task.class_labels @@ -462,7 +484,7 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]": return arff_dict - def get_metric_fn(self, sklearn_fn, kwargs=None): + def get_metric_fn(self, sklearn_fn: Callable, kwargs: dict | None = None) -> np.ndarray: # noqa: PLR0915, PLR0912, C901 """Calculates metric scores based on predicted values. Assumes the run has been executed locally (and contains run_data). 
Furthermore, it assumes that the 'correct' or 'truth' attribute is specified in @@ -474,16 +496,18 @@ def get_metric_fn(self, sklearn_fn, kwargs=None): sklearn_fn : function a function pointer to a sklearn function that accepts ``y_true``, ``y_pred`` and ``**kwargs`` + kwargs : dict + kwargs for the function Returns ------- - scores : list - a list of floats, of length num_folds * num_repeats + scores : ndarray of scores of length num_folds * num_repeats + metric results """ - kwargs = kwargs if kwargs else dict() + kwargs = kwargs if kwargs else {} if self.data_content is not None and self.task_id is not None: predictions_arff = self._generate_arff_dict() - elif "predictions" in self.output_files: + elif (self.output_files is not None) and ("predictions" in self.output_files): predictions_file_url = openml._api_calls._file_id_to_url( self.output_files["predictions"], "predictions.arff", @@ -493,7 +517,7 @@ def get_metric_fn(self, sklearn_fn, kwargs=None): # TODO: make this a stream reader else: raise ValueError( - "Run should have been locally executed or " "contain outputfile reference." + "Run should have been locally executed or " "contain outputfile reference.", ) # Need to know more about the task to compute scores correctly @@ -510,7 +534,7 @@ def get_metric_fn(self, sklearn_fn, kwargs=None): if task.task_type_id != TaskType.CLUSTERING and "prediction" not in attribute_names: raise ValueError('Attribute "predict" should be set for ' "supervised task runs") - def _attribute_list_to_dict(attribute_list): + def _attribute_list_to_dict(attribute_list): # type: ignore # convenience function: Creates a mapping to map from the name of # attributes present in the arff prediction file to their index. 
# This is necessary because the number of classes can be different @@ -526,10 +550,7 @@ def _attribute_list_to_dict(attribute_list): fold_idx = attribute_dict["fold"] predicted_idx = attribute_dict["prediction"] # Assume supervised task - if ( - task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION - or task.task_type_id == TaskType.LEARNING_CURVE - ): + if task.task_type_id in (TaskType.SUPERVISED_CLASSIFICATION, TaskType.LEARNING_CURVE): correct_idx = attribute_dict["correct"] elif task.task_type_id == TaskType.SUPERVISED_REGRESSION: correct_idx = attribute_dict["truth"] @@ -545,27 +566,23 @@ def _attribute_list_to_dict(attribute_list): pred = predictions_arff["attributes"][predicted_idx][1] corr = predictions_arff["attributes"][correct_idx][1] raise ValueError( - "Predicted and Correct do not have equal values:" - " %s Vs. %s" % (str(pred), str(corr)) + "Predicted and Correct do not have equal values:" f" {pred!s} Vs. {corr!s}", ) # TODO: these could be cached - values_predict = {} - values_correct = {} - for line_idx, line in enumerate(predictions_arff["data"]): + values_predict: dict[int, dict[int, dict[int, list[float]]]] = {} + values_correct: dict[int, dict[int, dict[int, list[float]]]] = {} + for _line_idx, line in enumerate(predictions_arff["data"]): rep = line[repeat_idx] fold = line[fold_idx] - if has_samples: - samp = line[sample_idx] - else: - samp = 0 # No learning curve sample, always 0 + samp = line[sample_idx] if has_samples else 0 if task.task_type_id in [ TaskType.SUPERVISED_CLASSIFICATION, TaskType.LEARNING_CURVE, ]: prediction = predictions_arff["attributes"][predicted_idx][1].index( - line[predicted_idx] + line[predicted_idx], ) correct = predictions_arff["attributes"][predicted_idx][1].index(line[correct_idx]) elif task.task_type_id == TaskType.SUPERVISED_REGRESSION: @@ -585,19 +602,19 @@ def _attribute_list_to_dict(attribute_list): values_correct[rep][fold][samp].append(correct) scores = [] - for rep in values_predict.keys(): - for fold 
in values_predict[rep].keys(): + for rep in values_predict: + for fold in values_predict[rep]: last_sample = len(values_predict[rep][fold]) - 1 y_pred = values_predict[rep][fold][last_sample] y_true = values_correct[rep][fold][last_sample] scores.append(sklearn_fn(y_true, y_pred, **kwargs)) return np.array(scores) - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.run_id = int(xml_response["oml:upload_run"]["oml:run_id"]) - def _get_file_elements(self) -> Dict: + def _get_file_elements(self) -> dict: """Get file_elements to upload to the server. Derived child classes should overwrite this method as necessary. @@ -605,21 +622,22 @@ def _get_file_elements(self) -> Dict: """ if self.parameter_settings is None and self.model is None: raise PyOpenMLError( - "OpenMLRun must contain a model or be initialized with parameter_settings." + "OpenMLRun must contain a model or be initialized with parameter_settings.", ) if self.flow_id is None: if self.flow is None: raise PyOpenMLError( "OpenMLRun object does not contain a flow id or reference to OpenMLFlow " - "(these should have been added while executing the task). " + "(these should have been added while executing the task). ", ) - else: - # publish the linked Flow before publishing the run. - self.flow.publish() - self.flow_id = self.flow.flow_id + + # publish the linked Flow before publishing the run. 
+ self.flow.publish() + self.flow_id = self.flow.flow_id if self.parameter_settings is None: if self.flow is None: + assert self.flow_id is not None # for mypy self.flow = openml.flows.get_flow(self.flow_id) self.parameter_settings = self.flow.extension.obtain_parameter_values( self.flow, @@ -637,7 +655,7 @@ def _get_file_elements(self) -> Dict: file_elements["trace"] = ("trace.arff", trace_arff) return file_elements - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": + def _to_dict(self) -> dict[str, dict]: # noqa: PLR0912, C901 """Creates a dictionary representation of self.""" description = OrderedDict() # type: 'OrderedDict' description["oml:run"] = OrderedDict() @@ -657,7 +675,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": self.sample_evaluations is not None and len(self.sample_evaluations) > 0 ): description["oml:run"]["oml:output_data"] = OrderedDict() - description["oml:run"]["oml:output_data"]["oml:evaluation"] = list() + description["oml:run"]["oml:output_data"]["oml:evaluation"] = [] if self.fold_evaluations is not None: for measure in self.fold_evaluations: for repeat in self.fold_evaluations[measure]: @@ -668,7 +686,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": ("@fold", str(fold)), ("oml:name", measure), ("oml:value", str(value)), - ] + ], ) description["oml:run"]["oml:output_data"]["oml:evaluation"].append(current) if self.sample_evaluations is not None: @@ -683,9 +701,9 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": ("@sample", str(sample)), ("oml:name", measure), ("oml:value", str(value)), - ] + ], ) description["oml:run"]["oml:output_data"]["oml:evaluation"].append( - current + current, ) return description diff --git a/openml/runs/trace.py b/openml/runs/trace.py index f6b038a55..3b7d60c2f 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -1,10 +1,12 @@ # License: BSD 3-Clause +from __future__ import annotations +import json from collections import OrderedDict from dataclasses import 
dataclass -import json -import os -from typing import List, Tuple, Optional # noqa F401 +from pathlib import Path +from typing import IO, Any, Iterator +from typing_extensions import Self import arff import xmltodict @@ -19,7 +21,83 @@ ] -class OpenMLRunTrace(object): +@dataclass +class OpenMLTraceIteration: + """ + OpenML Trace Iteration: parsed output from Run Trace call + Exactly one of `setup_string` or `parameters` must be provided. + + Parameters + ---------- + repeat : int + repeat number (in case of no repeats: 0) + + fold : int + fold number (in case of no folds: 0) + + iteration : int + iteration number of optimization procedure + + setup_string : str, optional + json string representing the parameters + If not provided, ``parameters`` should be set. + + evaluation : double + The evaluation that was awarded to this trace iteration. + Measure is defined by the task + + selected : bool + Whether this was the best of all iterations, and hence + selected for making predictions. Per fold/repeat there + should be only one iteration selected + + parameters : OrderedDict, optional + Dictionary specifying parameter names and their values. + If not provided, ``setup_string`` should be set. 
+ """ + + repeat: int + fold: int + iteration: int + + evaluation: float + selected: bool + + setup_string: dict[str, str] | None = None + parameters: dict[str, str | int | float] | None = None + + def __post_init__(self) -> None: + # TODO: refactor into one argument of type + if self.setup_string and self.parameters: + raise ValueError( + "Can only be instantiated with either `setup_string` or `parameters` argument.", + ) + + if not (self.setup_string or self.parameters): + raise ValueError( + "Either `setup_string` or `parameters` needs to be passed as argument.", + ) + + if self.parameters is not None and not isinstance(self.parameters, dict): + raise TypeError( + "argument parameters is not an instance of OrderedDict, but %s" + % str(type(self.parameters)), + ) + + def get_parameters(self) -> dict[str, Any]: + """Get the parameters of this trace iteration.""" + # parameters have prefix 'parameter_' + if self.setup_string: + return { + param[len(PREFIX) :]: json.loads(value) + for param, value in self.setup_string.items() + } + + assert self.parameters is not None + return {param[len(PREFIX) :]: value for param, value in self.parameters.items()} + + +class OpenMLRunTrace: """OpenML Run Trace: parsed output from Run Trace call Parameters @@ -33,7 +111,20 @@ class OpenMLRunTrace(object): """ - def __init__(self, run_id, trace_iterations): + def __init__( + self, + run_id: int | None, + trace_iterations: dict[tuple[int, int, int], OpenMLTraceIteration], + ): + """Object to hold the trace content of a run. + + Parameters + ---------- + run_id : int + Id for which the trace content is to be stored. + trace_iterations : List[List] + The trace content obtained by running a flow on a task. 
+ """ self.run_id = run_id self.trace_iterations = trace_iterations @@ -50,7 +141,7 @@ def get_selected_iteration(self, fold: int, repeat: int) -> int: repeat: int Returns - ---------- + ------- int The trace iteration from the given fold and repeat that was selected as the best iteration by the search procedure @@ -59,11 +150,15 @@ def get_selected_iteration(self, fold: int, repeat: int) -> int: if r == repeat and f == fold and self.trace_iterations[(r, f, i)].selected is True: return i raise ValueError( - "Could not find the selected iteration for rep/fold %d/%d" % (repeat, fold) + "Could not find the selected iteration for rep/fold %d/%d" % (repeat, fold), ) @classmethod - def generate(cls, attributes, content): + def generate( + cls, + attributes: list[tuple[str, str]], + content: list[list[int | float | str]], + ) -> OpenMLRunTrace: """Generates an OpenMLRunTrace. Generates the trace object from the attributes and content extracted @@ -71,7 +166,6 @@ def generate(cls, attributes, content): Parameters ---------- - attributes : list List of tuples describing the arff attributes. @@ -83,17 +177,16 @@ def generate(cls, attributes, content): ------- OpenMLRunTrace """ - if content is None: raise ValueError("Trace content not available.") - elif attributes is None: + if attributes is None: raise ValueError("Trace attributes not available.") - elif len(content) == 0: + if len(content) == 0: raise ValueError("Trace content is empty.") - elif len(attributes) != len(content[0]): + if len(attributes) != len(content[0]): raise ValueError( "Trace_attributes and trace_content not compatible:" - " %s vs %s" % (attributes, content[0]) + f" {attributes} vs {content[0]}", ) return cls._trace_from_arff_struct( @@ -104,23 +197,25 @@ def generate(cls, attributes, content): ) @classmethod - def _from_filesystem(cls, file_path: str) -> "OpenMLRunTrace": + def _from_filesystem(cls, file_path: str | Path) -> OpenMLRunTrace: """ Logic to deserialize the trace from the filesystem. 
Parameters ---------- - file_path: str + file_path: str | Path File path where the trace arff is stored. Returns - ---------- + ------- OpenMLRunTrace """ - if not os.path.isfile(file_path): + file_path = Path(file_path) + + if not file_path.exists(): raise ValueError("Trace file doesn't exist") - with open(file_path, "r") as fp: + with file_path.open("r") as fp: trace_arff = arff.load(fp) for trace_idx in range(len(trace_arff["data"])): @@ -128,27 +223,28 @@ def _from_filesystem(cls, file_path: str) -> "OpenMLRunTrace": # (fold, repeat, trace_iteration) these should be int for line_idx in range(3): trace_arff["data"][trace_idx][line_idx] = int( - trace_arff["data"][trace_idx][line_idx] + trace_arff["data"][trace_idx][line_idx], ) return cls.trace_from_arff(trace_arff) - def _to_filesystem(self, file_path): + def _to_filesystem(self, file_path: str | Path) -> None: """Serialize the trace object to the filesystem. Serialize the trace object as an arff. Parameters ---------- - file_path: str + file_path: str | Path File path where the trace arff will be stored. """ + trace_path = Path(file_path) / "trace.arff" trace_arff = arff.dumps(self.trace_to_arff()) - with open(os.path.join(file_path, "trace.arff"), "w") as f: + with trace_path.open("w") as f: f.write(trace_arff) - def trace_to_arff(self): + def trace_to_arff(self) -> dict[str, Any]: """Generate the arff dictionary for uploading predictions to the server. Uses the trace object to generate an arff dictionary representation. 
@@ -174,24 +270,23 @@ def trace_to_arff(self): [ (PREFIX + parameter, "STRING") for parameter in next(iter(self.trace_iterations.values())).get_parameters() - ] + ], ) - arff_dict = OrderedDict() + arff_dict: dict[str, Any] = {} data = [] for trace_iteration in self.trace_iterations.values(): tmp_list = [] - for attr, _ in trace_attributes: - if attr.startswith(PREFIX): - attr = attr[len(PREFIX) :] + for _attr, _ in trace_attributes: + if _attr.startswith(PREFIX): + attr = _attr[len(PREFIX) :] value = trace_iteration.get_parameters()[attr] else: + attr = _attr value = getattr(trace_iteration, attr) + if attr == "selected": - if value: - tmp_list.append("true") - else: - tmp_list.append("false") + tmp_list.append("true" if value else "false") else: tmp_list.append(value) data.append(tmp_list) @@ -203,7 +298,7 @@ def trace_to_arff(self): return arff_dict @classmethod - def trace_from_arff(cls, arff_obj): + def trace_from_arff(cls, arff_obj: dict[str, Any]) -> OpenMLRunTrace: """Generate trace from arff trace. Creates a trace file from arff object (for example, generated by a @@ -227,7 +322,30 @@ def trace_from_arff(cls, arff_obj): ) @classmethod - def _trace_from_arff_struct(cls, attributes, content, error_message): + def _trace_from_arff_struct( + cls, + attributes: list[tuple[str, str]], + content: list[list[int | float | str]], + error_message: str, + ) -> Self: + """Generate a trace dictionary from ARFF structure. + + Parameters + ---------- + cls : type + The trace object to be created. + attributes : list[tuple[str, str]] + Attribute descriptions. + content : list[list[int | float | str]]] + List of instances. + error_message : str + Error message to raise if `setup_string` is in `attributes`. + + Returns + ------- + OrderedDict + A dictionary representing the trace. 
+ """ trace = OrderedDict() attribute_idx = {att[0]: idx for idx, att in enumerate(attributes)} @@ -241,17 +359,16 @@ def _trace_from_arff_struct(cls, attributes, content, error_message): # they are not parameters parameter_attributes = [] for attribute in attribute_idx: - if attribute in REQUIRED_ATTRIBUTES: - continue - elif attribute == "setup_string": + if attribute in REQUIRED_ATTRIBUTES or attribute == "setup_string": continue - elif not attribute.startswith(PREFIX): + + if not attribute.startswith(PREFIX): raise ValueError( - "Encountered unknown attribute %s that does not start " - "with prefix %s" % (attribute, PREFIX) + f"Encountered unknown attribute {attribute} that does not start " + f"with prefix {PREFIX}", ) - else: - parameter_attributes.append(attribute) + + parameter_attributes.append(attribute) for itt in content: repeat = int(itt[attribute_idx["repeat"]]) @@ -266,12 +383,12 @@ def _trace_from_arff_struct(cls, attributes, content, error_message): else: raise ValueError( 'expected {"true", "false"} value for selected field, ' - "received: %s" % selected_value + "received: %s" % selected_value, ) - parameters = OrderedDict( - [(attribute, itt[attribute_idx[attribute]]) for attribute in parameter_attributes] - ) + parameters = { + attribute: itt[attribute_idx[attribute]] for attribute in parameter_attributes + } current = OpenMLTraceIteration( repeat=repeat, @@ -287,7 +404,7 @@ def _trace_from_arff_struct(cls, attributes, content, error_message): return cls(None, trace) @classmethod - def trace_from_xml(cls, xml): + def trace_from_xml(cls, xml: str | Path | IO) -> OpenMLRunTrace: """Generate trace from xml. Creates a trace file from the xml description. @@ -304,6 +421,9 @@ def trace_from_xml(cls, xml): Object containing the run id and a dict containing the trace iterations. 
""" + if isinstance(xml, Path): + xml = str(xml.absolute()) + result_dict = xmltodict.parse(xml, force_list=("oml:trace_iteration",))["oml:trace"] run_id = result_dict["oml:run_id"] @@ -328,7 +448,7 @@ def trace_from_xml(cls, xml): else: raise ValueError( 'expected {"true", "false"} value for ' - "selected field, received: %s" % selected_value + "selected field, received: %s" % selected_value, ) current = OpenMLTraceIteration( @@ -344,30 +464,55 @@ def trace_from_xml(cls, xml): return cls(run_id, trace) @classmethod - def merge_traces(cls, traces: List["OpenMLRunTrace"]) -> "OpenMLRunTrace": - merged_trace = ( - OrderedDict() - ) # type: OrderedDict[Tuple[int, int, int], OpenMLTraceIteration] # noqa E501 + def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace: + """Merge multiple traces into a single trace. + + Parameters + ---------- + cls : type + Type of the trace object to be created. + traces : List[OpenMLRunTrace] + List of traces to merge. + + Returns + ------- + OpenMLRunTrace + A trace object representing the merged traces. + + Raises + ------ + ValueError + If the parameters in the iterations of the traces being merged are not equal. + If a key (repeat, fold, iteration) is encountered twice while merging the traces. 
+ """ + merged_trace: dict[tuple[int, int, int], OpenMLTraceIteration] = {} previous_iteration = None for trace in traces: for iteration in trace: key = (iteration.repeat, iteration.fold, iteration.iteration) + + assert iteration.parameters is not None + param_keys = iteration.parameters.keys() + if previous_iteration is not None: - if list(merged_trace[previous_iteration].parameters.keys()) != list( - iteration.parameters.keys() - ): + trace_itr = merged_trace[previous_iteration] + + assert trace_itr.parameters is not None + trace_itr_keys = trace_itr.parameters.keys() + + if list(param_keys) != list(trace_itr_keys): raise ValueError( "Cannot merge traces because the parameters are not equal: " "{} vs {}".format( - list(merged_trace[previous_iteration].parameters.keys()), + list(trace_itr.parameters.keys()), list(iteration.parameters.keys()), - ) + ), ) if key in merged_trace: raise ValueError( - "Cannot merge traces because key '{}' was encountered twice".format(key) + f"Cannot merge traces because key '{key}' was encountered twice", ) merged_trace[key] = iteration @@ -375,88 +520,11 @@ def merge_traces(cls, traces: List["OpenMLRunTrace"]) -> "OpenMLRunTrace": return cls(None, merged_trace) - def __repr__(self): + def __repr__(self) -> str: return "[Run id: {}, {} trace iterations]".format( -1 if self.run_id is None else self.run_id, len(self.trace_iterations), ) - def __iter__(self): - for val in self.trace_iterations.values(): - yield val - - -@dataclass -class OpenMLTraceIteration: - """ - OpenML Trace Iteration: parsed output from Run Trace call - Exactly one of `setup_string` or `parameters` must be provided. - - Parameters - ---------- - repeat : int - repeat number (in case of no repeats: 0) - - fold : int - fold number (in case of no folds: 0) - - iteration : int - iteration number of optimization procedure - - setup_string : str, optional - json string representing the parameters - If not provided, ``parameters`` should be set. 
- - evaluation : double - The evaluation that was awarded to this trace iteration. - Measure is defined by the task - - selected : bool - Whether this was the best of all iterations, and hence - selected for making predictions. Per fold/repeat there - should be only one iteration selected - - parameters : OrderedDict, optional - Dictionary specifying parameter names and their values. - If not provided, ``setup_string`` should be set. - """ - - repeat: int - fold: int - iteration: int - - evaluation: float - selected: bool - - setup_string: Optional[str] = None - parameters: Optional[OrderedDict] = None - - def __post_init__(self): - # TODO: refactor into one argument of type - if self.setup_string and self.parameters: - raise ValueError( - "Can only be instantiated with either `setup_string` or `parameters` argument." - ) - elif not (self.setup_string or self.parameters): - raise ValueError( - "Either `setup_string` or `parameters` needs to be passed as argument." - ) - if self.parameters is not None and not isinstance(self.parameters, OrderedDict): - raise TypeError( - "argument parameters is not an instance of OrderedDict, but %s" - % str(type(self.parameters)) - ) - - def get_parameters(self): - result = {} - # parameters have prefix 'parameter_' - - if self.setup_string: - for param in self.setup_string: - key = param[len(PREFIX) :] - value = self.setup_string[param] - result[key] = json.loads(value) - else: - for param, value in self.parameters.items(): - result[param[len(PREFIX) :]] = value - return result + def __iter__(self) -> Iterator[OpenMLTraceIteration]: + yield from self.trace_iterations.values() diff --git a/openml/setups/__init__.py b/openml/setups/__init__.py index 31f4f503f..dd38cb9b7 100644 --- a/openml/setups/__init__.py +++ b/openml/setups/__init__.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause -from .setup import OpenMLSetup, OpenMLParameter -from .functions import get_setup, list_setups, setup_exists, initialize_model +from .functions import 
get_setup, initialize_model, list_setups, setup_exists +from .setup import OpenMLParameter, OpenMLSetup __all__ = [ "OpenMLSetup", diff --git a/openml/setups/functions.py b/openml/setups/functions.py index b9af97c6e..ee0c6d707 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -1,28 +1,31 @@ # License: BSD 3-Clause +from __future__ import annotations + import warnings from collections import OrderedDict -import io -import os -from typing import Any, Union, List, Dict, Optional +from pathlib import Path +from typing import Any, Iterable +from typing_extensions import Literal -import xmltodict import pandas as pd +import xmltodict import openml -from .. import config -from .setup import OpenMLSetup, OpenMLParameter -from openml.flows import flow_exists import openml.exceptions import openml.utils +from openml import config +from openml.flows import OpenMLFlow, flow_exists +from .setup import OpenMLParameter, OpenMLSetup -def setup_exists(flow) -> int: + +def setup_exists(flow: OpenMLFlow) -> int: """ Checks whether a hyperparameter configuration already exists on the server. Parameters ---------- - flow : flow + flow : OpenMLFlow The openml flow object. Should have flow id present for the main flow and all subflows (i.e., it should be downloaded from the server by means of flow.get, and not instantiated locally) @@ -44,40 +47,57 @@ def setup_exists(flow) -> int: if exists != flow.flow_id: raise ValueError( f"Local flow id ({flow.id}) differs from server id ({exists}). " - "If this issue persists, please contact the developers." 
+ "If this issue persists, please contact the developers.", ) openml_param_settings = flow.extension.obtain_parameter_values(flow) description = xmltodict.unparse(_to_dict(flow.flow_id, openml_param_settings), pretty=True) file_elements = { - "description": ("description.arff", description) + "description": ("description.arff", description), } # type: openml._api_calls.FILE_ELEMENTS_TYPE result = openml._api_calls._perform_api_call( - "/setup/exists/", "post", file_elements=file_elements + "/setup/exists/", + "post", + file_elements=file_elements, ) result_dict = xmltodict.parse(result) setup_id = int(result_dict["oml:setup_exists"]["oml:id"]) return setup_id if setup_id > 0 else False -def _get_cached_setup(setup_id): - """Load a run from the cache.""" - cache_dir = config.get_cache_directory() - setup_cache_dir = os.path.join(cache_dir, "setups", str(setup_id)) +def _get_cached_setup(setup_id: int) -> OpenMLSetup: + """Load a run from the cache. + + Parameters + ---------- + setup_id : int + ID of the setup to be loaded. + + Returns + ------- + OpenMLSetup + The loaded setup object. + + Raises + ------ + OpenMLCacheException + If the setup file for the given setup ID is not cached. 
+ """ + cache_dir = Path(config.get_cache_directory()) + setup_cache_dir = cache_dir / "setups" / str(setup_id) try: - setup_file = os.path.join(setup_cache_dir, "description.xml") - with io.open(setup_file, encoding="utf8") as fh: + setup_file = setup_cache_dir / "description.xml" + with setup_file.open(encoding="utf8") as fh: setup_xml = xmltodict.parse(fh.read()) - setup = _create_setup_from_xml(setup_xml, output_format="object") - return setup + return _create_setup_from_xml(setup_xml, output_format="object") # type: ignore - except (OSError, IOError): + except OSError as e: raise openml.exceptions.OpenMLCacheException( - "Setup file for setup id %d not cached" % setup_id - ) + "Setup file for setup id %d not cached" % setup_id, + ) from e -def get_setup(setup_id): +def get_setup(setup_id: int) -> OpenMLSetup: """ Downloads the setup (configuration) description from OpenML and returns a structured object @@ -89,34 +109,33 @@ def get_setup(setup_id): Returns ------- - dict or OpenMLSetup(an initialized openml setup object) + OpenMLSetup (an initialized openml setup object) """ - setup_dir = os.path.join(config.get_cache_directory(), "setups", str(setup_id)) - setup_file = os.path.join(setup_dir, "description.xml") + setup_dir = Path(config.get_cache_directory()) / "setups" / str(setup_id) + setup_dir.mkdir(exist_ok=True, parents=True) - if not os.path.exists(setup_dir): - os.makedirs(setup_dir) + setup_file = setup_dir / "description.xml" try: return _get_cached_setup(setup_id) except openml.exceptions.OpenMLCacheException: url_suffix = "/setup/%d" % setup_id setup_xml = openml._api_calls._perform_api_call(url_suffix, "get") - with io.open(setup_file, "w", encoding="utf8") as fh: + with setup_file.open("w", encoding="utf8") as fh: fh.write(setup_xml) result_dict = xmltodict.parse(setup_xml) - return _create_setup_from_xml(result_dict, output_format="object") + return _create_setup_from_xml(result_dict, output_format="object") # type: ignore -def list_setups( - 
offset: Optional[int] = None, - size: Optional[int] = None, - flow: Optional[int] = None, - tag: Optional[str] = None, - setup: Optional[List] = None, - output_format: str = "object", -) -> Union[Dict, pd.DataFrame]: +def list_setups( # noqa: PLR0913 + offset: int | None = None, + size: int | None = None, + flow: int | None = None, + tag: str | None = None, + setup: Iterable[int] | None = None, + output_format: Literal["object", "dict", "dataframe"] = "object", +) -> dict | pd.DataFrame: """ List all setups matching all of the given filters. @@ -126,10 +145,9 @@ def list_setups( size : int, optional flow : int, optional tag : str, optional - setup : list(int), optional + setup : Iterable[int], optional output_format: str, optional (default='object') The parameter decides the format of the output. - - If 'object' the output is a dict of OpenMLSetup objects - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame @@ -139,7 +157,7 @@ def list_setups( """ if output_format not in ["dataframe", "dict", "object"]: raise ValueError( - "Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable." + "Invalid output format selected. 
" "Only 'dict', 'object', or 'dataframe' applicable.", ) # TODO: [0.15] @@ -152,8 +170,8 @@ def list_setups( warnings.warn(msg, category=FutureWarning, stacklevel=2) batch_size = 1000 # batch size for setups is lower - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_setups, offset=offset, size=size, @@ -164,7 +182,11 @@ def list_setups( ) -def _list_setups(setup=None, output_format="object", **kwargs): +def _list_setups( + setup: Iterable[int] | None = None, + output_format: Literal["dict", "dataframe", "object"] = "object", + **kwargs: Any, +) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]: """ Perform API call `/setup/list/{filters}` @@ -179,26 +201,28 @@ def _list_setups(setup=None, output_format="object", **kwargs): The parameter decides the format of the output. - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame + - If 'object' the output is a dict of OpenMLSetup objects kwargs: dict, optional Legal filter operators: flow, setup, limit, offset, tag. 
Returns ------- - dict or dataframe + dict or dataframe or list[OpenMLSetup] """ - api_call = "setup/list" if setup is not None: api_call += "/setup/%s" % ",".join([str(int(i)) for i in setup]) if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" return __list_setups(api_call=api_call, output_format=output_format) -def __list_setups(api_call, output_format="object"): +def __list_setups( + api_call: str, output_format: Literal["dict", "dataframe", "object"] = "object" +) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]: """Helper function to parse API calls which are lists of setups""" xml_string = openml._api_calls._perform_api_call(api_call, "get") setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",)) @@ -206,32 +230,35 @@ def __list_setups(api_call, output_format="object"): # Minimalistic check if the XML is useful if "oml:setups" not in setups_dict: raise ValueError( - 'Error in return XML, does not contain "oml:setups":' " %s" % str(setups_dict) + 'Error in return XML, does not contain "oml:setups":' " %s" % str(setups_dict), ) - elif "@xmlns:oml" not in setups_dict["oml:setups"]: + + if "@xmlns:oml" not in setups_dict["oml:setups"]: raise ValueError( "Error in return XML, does not contain " - '"oml:setups"/@xmlns:oml: %s' % str(setups_dict) + '"oml:setups"/@xmlns:oml: %s' % str(setups_dict), ) - elif setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri: + + if setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri: raise ValueError( "Error in return XML, value of " '"oml:seyups"/@xmlns:oml is not ' - '"%s": %s' % (openml_uri, str(setups_dict)) + f'"{openml_uri}": {setups_dict!s}', ) - assert type(setups_dict["oml:setups"]["oml:setup"]) == list, type(setups_dict["oml:setups"]) + assert isinstance(setups_dict["oml:setups"]["oml:setup"], list), type(setups_dict["oml:setups"]) - setups = dict() + setups = {} for setup_ in 
setups_dict["oml:setups"]["oml:setup"]: # making it a dict to give it the right format current = _create_setup_from_xml( - {"oml:setup_parameters": setup_}, output_format=output_format + {"oml:setup_parameters": setup_}, + output_format=output_format, ) if output_format == "object": - setups[current.setup_id] = current + setups[current.setup_id] = current # type: ignore else: - setups[current["setup_id"]] = current + setups[current["setup_id"]] = current # type: ignore if output_format == "dataframe": setups = pd.DataFrame.from_dict(setups, orient="index") @@ -259,21 +286,38 @@ def initialize_model(setup_id: int) -> Any: # instead of using scikit-learns or any other library's "set_params" function, we override the # OpenMLFlow objects default parameter value so we can utilize the # Extension.flow_to_model() function to reinitialize the flow with the set defaults. - for hyperparameter in setup.parameters.values(): - structure = flow.get_structure("flow_id") - if len(structure[hyperparameter.flow_id]) > 0: - subflow = flow.get_subflow(structure[hyperparameter.flow_id]) - else: - subflow = flow - subflow.parameters[hyperparameter.parameter_name] = hyperparameter.value + if setup.parameters is not None: + for hyperparameter in setup.parameters.values(): + structure = flow.get_structure("flow_id") + if len(structure[hyperparameter.flow_id]) > 0: + subflow = flow.get_subflow(structure[hyperparameter.flow_id]) + else: + subflow = flow + subflow.parameters[hyperparameter.parameter_name] = hyperparameter.value + + return flow.extension.flow_to_model(flow) - model = flow.extension.flow_to_model(flow) - return model +def _to_dict( + flow_id: int, openml_parameter_settings: list[OpenMLParameter] | list[dict[str, Any]] +) -> OrderedDict: + """Convert a flow ID and a list of OpenML parameter settings to + a dictionary representation that can be serialized to XML. + + Parameters + ---------- + flow_id : int + ID of the flow. 
+ openml_parameter_settings : List[OpenMLParameter] + A list of OpenML parameter settings. -def _to_dict(flow_id, openml_parameter_settings): + Returns + ------- + OrderedDict + A dictionary representation of the flow ID and parameter settings. + """ # for convenience, this function (ab)uses the run object. - xml = OrderedDict() + xml: OrderedDict = OrderedDict() xml["oml:run"] = OrderedDict() xml["oml:run"]["@xmlns:oml"] = "http://openml.org/openml" xml["oml:run"]["oml:flow_id"] = flow_id @@ -282,43 +326,56 @@ def _to_dict(flow_id, openml_parameter_settings): return xml -def _create_setup_from_xml(result_dict, output_format="object"): - """ - Turns an API xml result into a OpenMLSetup object (or dict) - """ +def _create_setup_from_xml( + result_dict: dict, output_format: Literal["dict", "dataframe", "object"] = "object" +) -> OpenMLSetup | dict[str, int | dict[int, Any] | None]: + """Turns an API xml result into a OpenMLSetup object (or dict)""" + if output_format in ["dataframe", "dict"]: + _output_format: Literal["dict", "object"] = "dict" + elif output_format == "object": + _output_format = "object" + else: + raise ValueError( + f"Invalid output format selected: {output_format}" + "Only 'dict', 'object', or 'dataframe' applicable.", + ) + setup_id = int(result_dict["oml:setup_parameters"]["oml:setup_id"]) flow_id = int(result_dict["oml:setup_parameters"]["oml:flow_id"]) - parameters = {} if "oml:parameter" not in result_dict["oml:setup_parameters"]: parameters = None else: + parameters = {} # basically all others xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"] if isinstance(xml_parameters, dict): - id = int(xml_parameters["oml:id"]) - parameters[id] = _create_setup_parameter_from_xml( - result_dict=xml_parameters, output_format=output_format + oml_id = int(xml_parameters["oml:id"]) + parameters[oml_id] = _create_setup_parameter_from_xml( + result_dict=xml_parameters, + output_format=_output_format, ) elif isinstance(xml_parameters, list): 
for xml_parameter in xml_parameters: - id = int(xml_parameter["oml:id"]) - parameters[id] = _create_setup_parameter_from_xml( - result_dict=xml_parameter, output_format=output_format + oml_id = int(xml_parameter["oml:id"]) + parameters[oml_id] = _create_setup_parameter_from_xml( + result_dict=xml_parameter, + output_format=_output_format, ) else: raise ValueError( "Expected None, list or dict, received " - "something else: %s" % str(type(xml_parameters)) + "something else: %s" % str(type(xml_parameters)), ) - if output_format in ["dataframe", "dict"]: - return_dict = {"setup_id": setup_id, "flow_id": flow_id} - return_dict["parameters"] = parameters - return return_dict + if _output_format in ["dataframe", "dict"]: + return {"setup_id": setup_id, "flow_id": flow_id, "parameters": parameters} return OpenMLSetup(setup_id, flow_id, parameters) -def _create_setup_parameter_from_xml(result_dict, output_format="object"): +def _create_setup_parameter_from_xml( + result_dict: dict[str, str], output_format: Literal["object", "dict"] = "object" +) -> dict[str, int | str] | OpenMLParameter: + """Create an OpenMLParameter object or a dictionary from an API xml result.""" if output_format == "object": return OpenMLParameter( input_id=int(result_dict["oml:id"]), @@ -330,14 +387,16 @@ def _create_setup_parameter_from_xml(result_dict, output_format="object"): default_value=result_dict["oml:default_value"], value=result_dict["oml:value"], ) - else: - return { - "input_id": int(result_dict["oml:id"]), - "flow_id": int(result_dict["oml:flow_id"]), - "flow_name": result_dict["oml:flow_name"], - "full_name": result_dict["oml:full_name"], - "parameter_name": result_dict["oml:parameter_name"], - "data_type": result_dict["oml:data_type"], - "default_value": result_dict["oml:default_value"], - "value": result_dict["oml:value"], - } + + # FIXME: likely we want to crash here if unknown output_format but not backwards compatible + # output_format == "dict" case, + return { + "input_id": 
int(result_dict["oml:id"]), + "flow_id": int(result_dict["oml:flow_id"]), + "flow_name": result_dict["oml:flow_name"], + "full_name": result_dict["oml:full_name"], + "parameter_name": result_dict["oml:parameter_name"], + "data_type": result_dict["oml:data_type"], + "default_value": result_dict["oml:default_value"], + "value": result_dict["oml:value"], + } diff --git a/openml/setups/setup.py b/openml/setups/setup.py index 44919fd09..e8dc059e7 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -1,9 +1,13 @@ # License: BSD 3-Clause +from __future__ import annotations + +from typing import Any import openml.config +import openml.flows -class OpenMLSetup(object): +class OpenMLSetup: """Setup object (a.k.a. Configuration). Parameters @@ -16,20 +20,21 @@ class OpenMLSetup(object): The setting of the parameters """ - def __init__(self, setup_id, flow_id, parameters): + def __init__(self, setup_id: int, flow_id: int, parameters: dict[int, Any] | None): if not isinstance(setup_id, int): raise ValueError("setup id should be int") + if not isinstance(flow_id, int): raise ValueError("flow id should be int") - if parameters is not None: - if not isinstance(parameters, dict): - raise ValueError("parameters should be dict") + + if parameters is not None and not isinstance(parameters, dict): + raise ValueError("parameters should be dict") self.setup_id = setup_id self.flow_id = flow_id self.parameters = parameters - def __repr__(self): + def __repr__(self) -> str: header = "OpenML Setup" header = "{}\n{}\n".format(header, "=" * len(header)) @@ -37,20 +42,22 @@ def __repr__(self): "Setup ID": self.setup_id, "Flow ID": self.flow_id, "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), - "# of Parameters": len(self.parameters), + "# of Parameters": ( + len(self.parameters) if self.parameters is not None else float("nan") + ), } # determines the order in which the information will be printed order = ["Setup ID", "Flow ID", "Flow URL", "# of Parameters"] - 
fields = [(key, fields[key]) for key in order if key in fields] + _fields = [(key, fields[key]) for key in order if key in fields] - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = "\n".join(field_line_format.format(name, value) for name, value in fields) + longest_field_name_length = max(len(name) for name, _ in _fields) + field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" + body = "\n".join(field_line_format.format(name, value) for name, value in _fields) return header + body -class OpenMLParameter(object): +class OpenMLParameter: """Parameter object (used in setup). Parameters @@ -75,16 +82,16 @@ class OpenMLParameter(object): If the parameter was set, the value that it was set to. """ - def __init__( + def __init__( # noqa: PLR0913 self, - input_id, - flow_id, - flow_name, - full_name, - parameter_name, - data_type, - default_value, - value, + input_id: int, + flow_id: int, + flow_name: str, + full_name: str, + parameter_name: str, + data_type: str, + default_value: str, + value: str, ): self.id = input_id self.flow_id = flow_id @@ -95,7 +102,7 @@ def __init__( self.default_value = default_value self.value = value - def __repr__(self): + def __repr__(self) -> str: header = "OpenML Parameter" header = "{}\n{}\n".format(header, "=" * len(header)) @@ -110,11 +117,11 @@ def __repr__(self): # indented prints for parameter attributes # indention = 2 spaces + 1 | + 2 underscores indent = "{}|{}".format(" " * 2, "_" * 2) - parameter_data_type = "{}Data Type".format(indent) + parameter_data_type = f"{indent}Data Type" fields[parameter_data_type] = self.data_type - parameter_default = "{}Default".format(indent) + parameter_default = f"{indent}Default" fields[parameter_default] = self.default_value - parameter_value = "{}Value".format(indent) + parameter_value = f"{indent}Value" fields[parameter_value] = self.value # determines the order in which the 
information will be printed @@ -128,9 +135,9 @@ def __repr__(self): parameter_default, parameter_value, ] - fields = [(key, fields[key]) for key in order if key in fields] + _fields = [(key, fields[key]) for key in order if key in fields] - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = "\n".join(field_line_format.format(name, value) for name, value in fields) + longest_field_name_length = max(len(name) for name, _ in _fields) + field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" + body = "\n".join(field_line_format.format(name, value) for name, value in _fields) return header + body diff --git a/openml/study/__init__.py b/openml/study/__init__.py index 030ee05c2..b7d77fec4 100644 --- a/openml/study/__init__.py +++ b/openml/study/__init__.py @@ -1,23 +1,22 @@ # License: BSD 3-Clause -from .study import OpenMLStudy, OpenMLBenchmarkSuite from .functions import ( - get_study, - get_suite, - create_study, - create_benchmark_suite, - update_study_status, - update_suite_status, attach_to_study, attach_to_suite, - detach_from_study, - detach_from_suite, + create_benchmark_suite, + create_study, delete_study, delete_suite, + detach_from_study, + detach_from_suite, + get_study, + get_suite, list_studies, list_suites, + update_study_status, + update_suite_status, ) - +from .study import OpenMLBenchmarkSuite, OpenMLStudy __all__ = [ "OpenMLStudy", diff --git a/openml/study/functions.py b/openml/study/functions.py index 1db09b8ad..9d726d286 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -1,17 +1,24 @@ # License: BSD 3-Clause +# ruff: noqa: PLR0913 +from __future__ import annotations -from typing import cast, Dict, List, Optional, Union import warnings +from typing import TYPE_CHECKING, Any, overload +from typing_extensions import Literal -import xmltodict import pandas as pd +import xmltodict -from openml.study import OpenMLStudy, 
OpenMLBenchmarkSuite -from openml.study.study import BaseStudy import openml._api_calls +import openml.config +import openml.utils +from openml.study.study import OpenMLBenchmarkSuite, OpenMLStudy + +if TYPE_CHECKING: + from openml.study.study import BaseStudy -def get_suite(suite_id: Union[int, str]) -> OpenMLBenchmarkSuite: +def get_suite(suite_id: int | str) -> OpenMLBenchmarkSuite: """ Retrieves all relevant information of an OpenML benchmarking suite from the server. @@ -25,14 +32,16 @@ def get_suite(suite_id: Union[int, str]) -> OpenMLBenchmarkSuite: OpenMLSuite The OpenML suite object """ - suite = cast(OpenMLBenchmarkSuite, _get_study(suite_id, entity_type="task")) - return suite + study = _get_study(suite_id, entity_type="task") + assert isinstance(study, OpenMLBenchmarkSuite) + + return study def get_study( - study_id: Union[int, str], - arg_for_backwards_compat: Optional[str] = None, -) -> OpenMLStudy: # noqa F401 + study_id: int | str, + arg_for_backwards_compat: str | None = None, # noqa: ARG001 +) -> OpenMLStudy: # F401 """ Retrieves all relevant information of an OpenML study from the server. @@ -57,18 +66,20 @@ def get_study( "It looks like you are running code from the OpenML100 paper. It still works, but lots " "of things have changed since then. Please use `get_suite('OpenML100')` instead." 
) - warnings.warn(message, DeprecationWarning) + warnings.warn(message, DeprecationWarning, stacklevel=2) openml.config.logger.warning(message) study = _get_study(study_id, entity_type="task") - return cast(OpenMLBenchmarkSuite, study) # type: ignore - else: - study = cast(OpenMLStudy, _get_study(study_id, entity_type="run")) - return study + assert isinstance(study, OpenMLBenchmarkSuite) + return study # type: ignore -def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: - call_suffix = "study/{}".format(str(id_)) - xml_string = openml._api_calls._perform_api_call(call_suffix, "get") + study = _get_study(study_id, entity_type="run") + assert isinstance(study, OpenMLStudy) + return study + + +def _get_study(id_: int | str, entity_type: str) -> BaseStudy: + xml_string = openml._api_calls._perform_api_call(f"study/{id_}", "get") force_list_tags = ( "oml:data_id", "oml:flow_id", @@ -81,13 +92,13 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: study_id = int(result_dict["oml:id"]) alias = result_dict["oml:alias"] if "oml:alias" in result_dict else None main_entity_type = result_dict["oml:main_entity_type"] + if entity_type != main_entity_type: raise ValueError( - "Unexpected entity type '{}' reported by the server, expected '{}'".format( - main_entity_type, - entity_type, - ) + f"Unexpected entity type '{main_entity_type}' reported by the server" + f", expected '{entity_type}'" ) + benchmark_suite = ( result_dict["oml:benchmark_suite"] if "oml:benchmark_suite" in result_dict else None ) @@ -106,7 +117,21 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: current_tag["window_start"] = tag["oml:window_start"] tags.append(current_tag) - def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: + def get_nested_ids_from_result_dict(key: str, subkey: str) -> list[int] | None: + """Extracts a list of nested IDs from a result dictionary. + + Parameters + ---------- + key : str + Nested OpenML IDs. 
+ subkey : str + The subkey contains the nested OpenML IDs. + + Returns + ------- + Optional[List] + A list of nested OpenML IDs, or None if the key is not present in the dictionary. + """ if result_dict.get(key) is not None: return [int(oml_id) for oml_id in result_dict[key][subkey]] return None @@ -137,7 +162,6 @@ def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: ) # type: BaseStudy elif main_entity_type in ["tasks", "task"]: - tasks = cast("List[int]", tasks) study = OpenMLBenchmarkSuite( suite_id=study_id, alias=alias, @@ -152,7 +176,7 @@ def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: ) else: - raise ValueError("Unknown entity type {}".format(main_entity_type)) + raise ValueError(f"Unknown entity type {main_entity_type}") return study @@ -160,9 +184,9 @@ def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: def create_study( name: str, description: str, - run_ids: Optional[List[int]] = None, - alias: Optional[str] = None, - benchmark_suite: Optional[int] = None, + run_ids: list[int] | None = None, + alias: str | None = None, + benchmark_suite: int | None = None, ) -> OpenMLStudy: """ Creates an OpenML study (collection of data, tasks, flows, setups and run), @@ -211,8 +235,8 @@ def create_study( def create_benchmark_suite( name: str, description: str, - task_ids: List[int], - alias: Optional[str] = None, + task_ids: list[int], + alias: str | None = None, ) -> OpenMLBenchmarkSuite: """ Creates an OpenML benchmark suite (collection of entity types, where @@ -319,7 +343,7 @@ def delete_study(study_id: int) -> bool: return openml.utils._delete_entity("study", study_id) -def attach_to_suite(suite_id: int, task_ids: List[int]) -> int: +def attach_to_suite(suite_id: int, task_ids: list[int]) -> int: """Attaches a set of tasks to a benchmarking suite. 
Parameters @@ -338,7 +362,7 @@ def attach_to_suite(suite_id: int, task_ids: List[int]) -> int: return attach_to_study(suite_id, task_ids) -def attach_to_study(study_id: int, run_ids: List[int]) -> int: +def attach_to_study(study_id: int, run_ids: list[int]) -> int: """Attaches a set of runs to a study. Parameters @@ -354,18 +378,17 @@ def attach_to_study(study_id: int, run_ids: List[int]) -> int: int new size of the study (in terms of explicitly linked entities) """ - # Interestingly, there's no need to tell the server about the entity type, it knows by itself - uri = "study/%d/attach" % study_id - post_variables = {"ids": ",".join(str(x) for x in run_ids)} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call( - call=uri, request_method="post", data=post_variables + call=f"study/{study_id}/attach", + request_method="post", + data={"ids": ",".join(str(x) for x in run_ids)}, ) result = xmltodict.parse(result_xml)["oml:study_attach"] return int(result["oml:linked_entities"]) -def detach_from_suite(suite_id: int, task_ids: List[int]) -> int: +def detach_from_suite(suite_id: int, task_ids: list[int]) -> int: """Detaches a set of task ids from a suite. Parameters @@ -379,11 +402,12 @@ def detach_from_suite(suite_id: int, task_ids: List[int]) -> int: Returns ------- int - new size of the study (in terms of explicitly linked entities)""" + new size of the study (in terms of explicitly linked entities) + """ return detach_from_study(suite_id, task_ids) -def detach_from_study(study_id: int, run_ids: List[int]) -> int: +def detach_from_study(study_id: int, run_ids: list[int]) -> int: """Detaches a set of run ids from a study. 
Parameters @@ -399,24 +423,47 @@ def detach_from_study(study_id: int, run_ids: List[int]) -> int: int new size of the study (in terms of explicitly linked entities) """ - # Interestingly, there's no need to tell the server about the entity type, it knows by itself uri = "study/%d/detach" % study_id post_variables = {"ids": ",".join(str(x) for x in run_ids)} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call( - call=uri, request_method="post", data=post_variables + call=uri, + request_method="post", + data=post_variables, ) result = xmltodict.parse(result_xml)["oml:study_detach"] return int(result["oml:linked_entities"]) +@overload +def list_suites( + offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + uploader: list[int] | None = ..., + output_format: Literal["dict"] = "dict", +) -> dict: + ... + + +@overload +def list_suites( + offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + uploader: list[int] | None = ..., + output_format: Literal["dataframe"] = "dataframe", +) -> pd.DataFrame: + ... + + def list_suites( - offset: Optional[int] = None, - size: Optional[int] = None, - status: Optional[str] = None, - uploader: Optional[List[int]] = None, - output_format: str = "dict", -) -> Union[Dict, pd.DataFrame]: + offset: int | None = None, + size: int | None = None, + status: str | None = None, + uploader: list[int] | None = None, + output_format: Literal["dict", "dataframe"] = "dict", +) -> dict | pd.DataFrame: """ Return a list of all suites which are on OpenML. @@ -461,7 +508,7 @@ def list_suites( """ if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. 
" "Only 'dict' or 'dataframe' applicable.", ) # TODO: [0.15] if output_format == "dict": @@ -472,8 +519,8 @@ def list_suites( ) warnings.warn(msg, category=FutureWarning, stacklevel=2) - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_studies, offset=offset, size=size, @@ -483,14 +530,38 @@ def list_suites( ) +@overload def list_studies( - offset: Optional[int] = None, - size: Optional[int] = None, - status: Optional[str] = None, - uploader: Optional[List[str]] = None, - benchmark_suite: Optional[int] = None, - output_format: str = "dict", -) -> Union[Dict, pd.DataFrame]: + offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + uploader: list[str] | None = ..., + benchmark_suite: int | None = ..., + output_format: Literal["dict"] = "dict", +) -> dict: + ... + + +@overload +def list_studies( + offset: int | None = ..., + size: int | None = ..., + status: str | None = ..., + uploader: list[str] | None = ..., + benchmark_suite: int | None = ..., + output_format: Literal["dataframe"] = "dataframe", +) -> pd.DataFrame: + ... + + +def list_studies( + offset: int | None = None, + size: int | None = None, + status: str | None = None, + uploader: list[str] | None = None, + benchmark_suite: int | None = None, + output_format: Literal["dict", "dataframe"] = "dict", +) -> dict | pd.DataFrame: """ Return a list of all studies which are on OpenML. @@ -542,7 +613,7 @@ def list_studies( """ if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. 
" "Only 'dict' or 'dataframe' applicable.", ) # TODO: [0.15] if output_format == "dict": @@ -553,8 +624,8 @@ def list_studies( ) warnings.warn(msg, category=FutureWarning, stacklevel=2) - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_studies, offset=offset, size=size, @@ -565,7 +636,19 @@ def list_studies( ) -def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: +@overload +def _list_studies(output_format: Literal["dict"] = "dict", **kwargs: Any) -> dict: + ... + + +@overload +def _list_studies(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: + ... + + +def _list_studies( + output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any +) -> dict | pd.DataFrame: """ Perform api call to return a list of studies. @@ -586,23 +669,52 @@ def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: api_call = "study/list" if kwargs is not None: for operator, value in kwargs.items(): - api_call += "/%s/%s" % (operator, value) + api_call += f"/{operator}/{value}" return __list_studies(api_call=api_call, output_format=output_format) -def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame]: +@overload +def __list_studies(api_call: str, output_format: Literal["dict"] = "dict") -> dict: + ... + + +@overload +def __list_studies(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: + ... + + +def __list_studies( + api_call: str, output_format: Literal["dict", "dataframe"] = "dict" +) -> dict | pd.DataFrame: + """Retrieves the list of OpenML studies and + returns it in a dictionary or a Pandas DataFrame. + + Parameters + ---------- + api_call : str + The API call for retrieving the list of OpenML studies. 
+ output_format : str in {"dict", "dataframe"} + Format of the output, either 'object' for a dictionary + or 'dataframe' for a Pandas DataFrame. + + Returns + ------- + Union[Dict, pd.DataFrame] + A dictionary or Pandas DataFrame of OpenML studies, + depending on the value of 'output_format'. + """ xml_string = openml._api_calls._perform_api_call(api_call, "get") study_dict = xmltodict.parse(xml_string, force_list=("oml:study",)) # Minimalistic check if the XML is useful - assert type(study_dict["oml:study_list"]["oml:study"]) == list, type( - study_dict["oml:study_list"] + assert isinstance(study_dict["oml:study_list"]["oml:study"], list), type( + study_dict["oml:study_list"], ) assert study_dict["oml:study_list"]["@xmlns:oml"] == "http://openml.org/openml", study_dict[ "oml:study_list" ]["@xmlns:oml"] - studies = dict() + studies = {} for study_ in study_dict["oml:study_list"]["oml:study"]: # maps from xml name to a tuple of (dict name, casting fn) expected_fields = { @@ -616,7 +728,7 @@ def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame "oml:creator": ("creator", int), } study_id = int(study_["oml:id"]) - current_study = dict() + current_study = {} for oml_field_name, (real_field_name, cast_fn) in expected_fields.items(): if oml_field_name in study_: current_study[real_field_name] = cast_fn(study_[oml_field_name]) diff --git a/openml/study/study.py b/openml/study/study.py index cfc4cab3b..83bbf0497 100644 --- a/openml/study/study.py +++ b/openml/study/study.py @@ -1,10 +1,11 @@ # License: BSD 3-Clause +# TODO(eddiebergman): Begging for dataclassses to shorten this all +from __future__ import annotations -from collections import OrderedDict -from typing import Dict, List, Optional, Tuple, Union, Any +from typing import Any, Sequence -import openml from openml.base import OpenMLBase +from openml.config import get_server_base_url class BaseStudy(OpenMLBase): @@ -55,23 +56,23 @@ class BaseStudy(OpenMLBase): a list of setup ids 
associated with this study """ - def __init__( + def __init__( # noqa: PLR0913 self, - study_id: Optional[int], - alias: Optional[str], + study_id: int | None, + alias: str | None, main_entity_type: str, - benchmark_suite: Optional[int], + benchmark_suite: int | None, name: str, description: str, - status: Optional[str], - creation_date: Optional[str], - creator: Optional[int], - tags: Optional[List[Dict]], - data: Optional[List[int]], - tasks: Optional[List[int]], - flows: Optional[List[int]], - runs: Optional[List[int]], - setups: Optional[List[int]], + status: str | None, + creation_date: str | None, + creator: int | None, + tags: list[dict] | None, + data: list[int] | None, + tasks: list[int] | None, + flows: list[int] | None, + runs: list[int] | None, + setups: list[int] | None, ): self.study_id = study_id self.alias = alias @@ -94,12 +95,13 @@ def _entity_letter(cls) -> str: return "s" @property - def id(self) -> Optional[int]: + def id(self) -> int | None: + """Return the id of the study.""" return self.study_id - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: """Collect all information to display in the __repr__ body.""" - fields: Dict[str, Any] = { + fields: dict[str, Any] = { "Name": self.name, "Status": self.status, "Main Entity Type": self.main_entity_type, @@ -108,7 +110,7 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: fields["ID"] = self.study_id fields["Study URL"] = self.openml_url if self.creator is not None: - fields["Creator"] = "{}/u/{}".format(openml.config.get_server_base_url(), self.creator) + fields["Creator"] = f"{get_server_base_url()}/u/{self.creator}" if self.creation_date is not None: fields["Upload Time"] = self.creation_date.replace("T", " ") if self.data is not None: @@ -136,42 +138,47 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: ] return [(key, 
fields[key]) for key in order if key in fields] - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.study_id = int(xml_response["oml:study_upload"]["oml:id"]) - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": + def _to_dict(self) -> dict[str, dict]: """Creates a dictionary representation of self.""" # some can not be uploaded, e.g., id, creator, creation_date simple_props = ["alias", "main_entity_type", "name", "description"] - # maps from attribute name (which is used as outer tag name) to immer - # tag name (e.g., self.tasks -> 1987 - # ) - complex_props = { - "tasks": "task_id", - "runs": "run_id", - } - - study_container = OrderedDict() # type: 'OrderedDict' - namespace_list = [("@xmlns:oml", "http://openml.org/openml")] - study_dict = OrderedDict(namespace_list) # type: 'OrderedDict' - study_container["oml:study"] = study_dict + # TODO(eddiebergman): Begging for a walrus if we can drop 3.7 + simple_prop_values = {} for prop_name in simple_props: content = getattr(self, prop_name, None) if content is not None: - study_dict["oml:" + prop_name] = content + simple_prop_values["oml:" + prop_name] = content + + # maps from attribute name (which is used as outer tag name) to immer + # tag name e.g., self.tasks -> 1987 + complex_props = {"tasks": "task_id", "runs": "run_id"} + + # TODO(eddiebergman): Begging for a walrus if we can drop 3.7 + complex_prop_values = {} for prop_name, inner_name in complex_props.items(): content = getattr(self, prop_name, None) if content is not None: - sub_dict = {"oml:" + inner_name: content} - study_dict["oml:" + prop_name] = sub_dict - return study_container + complex_prop_values["oml:" + prop_name] = {"oml:" + inner_name: content} + + return { + "oml:study": { + "@xmlns:oml": "http://openml.org/openml", + **simple_prop_values, + **complex_prop_values, + } + } - def push_tag(self, tag: str): 
+ def push_tag(self, tag: str) -> None: + """Add a tag to the study.""" raise NotImplementedError("Tags for studies is not (yet) supported.") - def remove_tag(self, tag: str): + def remove_tag(self, tag: str) -> None: + """Remove a tag from the study.""" raise NotImplementedError("Tags for studies is not (yet) supported.") @@ -219,22 +226,22 @@ class OpenMLStudy(BaseStudy): a list of setup ids associated with this study """ - def __init__( + def __init__( # noqa: PLR0913 self, - study_id: Optional[int], - alias: Optional[str], - benchmark_suite: Optional[int], + study_id: int | None, + alias: str | None, + benchmark_suite: int | None, name: str, description: str, - status: Optional[str], - creation_date: Optional[str], - creator: Optional[int], - tags: Optional[List[Dict]], - data: Optional[List[int]], - tasks: Optional[List[int]], - flows: Optional[List[int]], - runs: Optional[List[int]], - setups: Optional[List[int]], + status: str | None, + creation_date: str | None, + creator: int | None, + tags: list[dict] | None, + data: list[int] | None, + tasks: list[int] | None, + flows: list[int] | None, + runs: list[int] | None, + setups: list[int] | None, ): super().__init__( study_id=study_id, @@ -293,18 +300,18 @@ class OpenMLBenchmarkSuite(BaseStudy): a list of task ids associated with this study """ - def __init__( + def __init__( # noqa: PLR0913 self, - suite_id: Optional[int], - alias: Optional[str], + suite_id: int | None, + alias: str | None, name: str, description: str, - status: Optional[str], - creation_date: Optional[str], - creator: Optional[int], - tags: Optional[List[Dict]], - data: Optional[List[int]], - tasks: List[int], + status: str | None, + creation_date: str | None, + creator: int | None, + tags: list[dict] | None, + data: list[int] | None, + tasks: list[int] | None, ): super().__init__( study_id=suite_id, diff --git a/openml/tasks/__init__.py b/openml/tasks/__init__.py index a5d578d2d..f6df3a8d4 100644 --- a/openml/tasks/__init__.py +++ 
b/openml/tasks/__init__.py @@ -1,21 +1,21 @@ # License: BSD 3-Clause -from .task import ( - OpenMLTask, - OpenMLSupervisedTask, - OpenMLClassificationTask, - OpenMLRegressionTask, - OpenMLClusteringTask, - OpenMLLearningCurveTask, - TaskType, -) -from .split import OpenMLSplit from .functions import ( create_task, + delete_task, get_task, get_tasks, list_tasks, - delete_task, +) +from .split import OpenMLSplit +from .task import ( + OpenMLClassificationTask, + OpenMLClusteringTask, + OpenMLLearningCurveTask, + OpenMLRegressionTask, + OpenMLSupervisedTask, + OpenMLTask, + TaskType, ) __all__ = [ diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index b038179fc..c763714bf 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -1,55 +1,50 @@ # License: BSD 3-Clause -import warnings -from collections import OrderedDict -import io -import re +from __future__ import annotations + import os -from typing import Union, Dict, Optional, List +import re +import warnings +from typing import Any +from typing_extensions import Literal import pandas as pd import xmltodict -from ..exceptions import OpenMLCacheException -from ..datasets import get_dataset +import openml._api_calls +import openml.utils +from openml.datasets import get_dataset +from openml.exceptions import OpenMLCacheException + from .task import ( OpenMLClassificationTask, OpenMLClusteringTask, OpenMLLearningCurveTask, - TaskType, OpenMLRegressionTask, OpenMLSupervisedTask, OpenMLTask, + TaskType, ) -import openml.utils -import openml._api_calls TASKS_CACHE_DIR_NAME = "tasks" -def _get_cached_tasks(): +def _get_cached_tasks() -> dict[int, OpenMLTask]: """Return a dict of all the tasks which are cached locally. + Returns ------- tasks : OrderedDict A dict of all the cached tasks. Each task is an instance of OpenMLTask. 
""" - tasks = OrderedDict() - task_cache_dir = openml.utils._create_cache_directory(TASKS_CACHE_DIR_NAME) directory_content = os.listdir(task_cache_dir) directory_content.sort() + # Find all dataset ids for which we have downloaded the dataset # description - - for filename in directory_content: - if not re.match(r"[0-9]*", filename): - continue - - tid = int(filename) - tasks[tid] = _get_cached_task(tid) - - return tasks + tids = (int(did) for did in directory_content if re.match(r"[0-9]*", did)) + return {tid: _get_cached_task(tid) for tid in tids} def _get_cached_task(tid: int) -> OpenMLTask: @@ -66,16 +61,18 @@ def _get_cached_task(tid: int) -> OpenMLTask: """ tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, tid) + task_xml_path = tid_cache_dir / "task.xml" try: - with io.open(os.path.join(tid_cache_dir, "task.xml"), encoding="utf8") as fh: + with task_xml_path.open(encoding="utf8") as fh: return _create_task_from_xml(fh.read()) - except (OSError, IOError): + except OSError as e: openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) - raise OpenMLCacheException("Task file for tid %d not " "cached" % tid) + raise OpenMLCacheException(f"Task file for tid {tid} not cached") from e -def _get_estimation_procedure_list(): +def _get_estimation_procedure_list() -> list[dict[str, Any]]: """Return a list of all estimation procedures which are on OpenML. + Returns ------- procedures : list @@ -90,50 +87,52 @@ def _get_estimation_procedure_list(): # Minimalistic check if the XML is useful if "oml:estimationprocedures" not in procs_dict: raise ValueError("Error in return XML, does not contain tag oml:estimationprocedures.") - elif "@xmlns:oml" not in procs_dict["oml:estimationprocedures"]: + + if "@xmlns:oml" not in procs_dict["oml:estimationprocedures"]: raise ValueError( "Error in return XML, does not contain tag " - "@xmlns:oml as a child of oml:estimationprocedures." 
+ "@xmlns:oml as a child of oml:estimationprocedures.", ) - elif procs_dict["oml:estimationprocedures"]["@xmlns:oml"] != "http://openml.org/openml": + + if procs_dict["oml:estimationprocedures"]["@xmlns:oml"] != "http://openml.org/openml": raise ValueError( "Error in return XML, value of " "oml:estimationprocedures/@xmlns:oml is not " "http://openml.org/openml, but %s" - % str(procs_dict["oml:estimationprocedures"]["@xmlns:oml"]) + % str(procs_dict["oml:estimationprocedures"]["@xmlns:oml"]), ) - procs = [] + procs: list[dict[str, Any]] = [] for proc_ in procs_dict["oml:estimationprocedures"]["oml:estimationprocedure"]: task_type_int = int(proc_["oml:ttid"]) try: task_type_id = TaskType(task_type_int) + procs.append( + { + "id": int(proc_["oml:id"]), + "task_type_id": task_type_id, + "name": proc_["oml:name"], + "type": proc_["oml:type"], + }, + ) except ValueError as e: warnings.warn( f"Could not create task type id for {task_type_int} due to error {e}", RuntimeWarning, + stacklevel=2, ) - continue - procs.append( - { - "id": int(proc_["oml:id"]), - "task_type_id": task_type_id, - "name": proc_["oml:name"], - "type": proc_["oml:type"], - } - ) return procs def list_tasks( - task_type: Optional[TaskType] = None, - offset: Optional[int] = None, - size: Optional[int] = None, - tag: Optional[str] = None, - output_format: str = "dict", - **kwargs, -) -> Union[Dict, pd.DataFrame]: + task_type: TaskType | None = None, + offset: int | None = None, + size: int | None = None, + tag: str | None = None, + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Return a number of tasks having the given tag and task_type @@ -174,7 +173,7 @@ def list_tasks( """ if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. 
" "Only 'dict' or 'dataframe' applicable.", ) # TODO: [0.15] if output_format == "dict": @@ -184,8 +183,8 @@ def list_tasks( "will continue to work, use `output_format`='dataframe'." ) warnings.warn(msg, category=FutureWarning, stacklevel=2) - return openml.utils._list_all( - output_format=output_format, + return openml.utils._list_all( # type: ignore + list_output_format=output_format, # type: ignore listing_call=_list_tasks, task_type=task_type, offset=offset, @@ -195,9 +194,14 @@ def list_tasks( ) -def _list_tasks(task_type=None, output_format="dict", **kwargs): +def _list_tasks( + task_type: TaskType | None = None, + output_format: Literal["dict", "dataframe"] = "dict", + **kwargs: Any, +) -> dict | pd.DataFrame: """ Perform the api call to return a number of tasks having the given filters. + Parameters ---------- Filter task_type is separated from the other filters because @@ -224,33 +228,62 @@ def _list_tasks(task_type=None, output_format="dict", **kwargs): if kwargs is not None: for operator, value in kwargs.items(): if operator == "task_id": - value = ",".join([str(int(i)) for i in value]) - api_call += "/%s/%s" % (operator, value) + value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901 + api_call += f"/{operator}/{value}" + return __list_tasks(api_call=api_call, output_format=output_format) -def __list_tasks(api_call, output_format="dict"): +# TODO(eddiebergman): overload todefine type returned +def __list_tasks( # noqa: PLR0912, C901 + api_call: str, + output_format: Literal["dict", "dataframe"] = "dict", +) -> dict | pd.DataFrame: + """Returns a dictionary or a Pandas DataFrame with information about OpenML tasks. + + Parameters + ---------- + api_call : str + The API call specifying which tasks to return. + output_format : str in {"dict", "dataframe"} + Output format for the returned object. + + Returns + ------- + Union[Dict, pd.DataFrame] + A dictionary or a Pandas DataFrame with information about OpenML tasks. 
+ + Raises + ------ + ValueError + If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml', + or has an incorrect value for '@xmlns:oml'. + KeyError + If an invalid key is found in the XML for a task. + """ xml_string = openml._api_calls._perform_api_call(api_call, "get") tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input")) # Minimalistic check if the XML is useful if "oml:tasks" not in tasks_dict: - raise ValueError('Error in return XML, does not contain "oml:runs": %s' % str(tasks_dict)) - elif "@xmlns:oml" not in tasks_dict["oml:tasks"]: + raise ValueError(f'Error in return XML, does not contain "oml:runs": {tasks_dict}') + + if "@xmlns:oml" not in tasks_dict["oml:tasks"]: raise ValueError( - "Error in return XML, does not contain " '"oml:runs"/@xmlns:oml: %s' % str(tasks_dict) + f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {tasks_dict}' ) - elif tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml": + + if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml": raise ValueError( "Error in return XML, value of " '"oml:runs"/@xmlns:oml is not ' - '"http://openml.org/openml": %s' % str(tasks_dict) + '"http://openml.org/openml": %s' % str(tasks_dict), ) - assert type(tasks_dict["oml:tasks"]["oml:task"]) == list, type(tasks_dict["oml:tasks"]) + assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"]) - tasks = dict() + tasks = {} procs = _get_estimation_procedure_list() - proc_dict = dict((x["id"], x) for x in procs) + proc_dict = {x["id"]: x for x in procs} for task_ in tasks_dict["oml:tasks"]["oml:task"]: tid = None @@ -263,8 +296,10 @@ def __list_tasks(api_call, output_format="dict"): warnings.warn( f"Could not create task type id for {task_type_int} due to error {e}", RuntimeWarning, + stacklevel=2, ) continue + task = { "tid": tid, "ttid": task_type_id, @@ -275,15 +310,15 @@ def __list_tasks(api_call, output_format="dict"): } # 
Other task inputs - for input in task_.get("oml:input", list()): - if input["@name"] == "estimation_procedure": - task[input["@name"]] = proc_dict[int(input["#text"])]["name"] + for _input in task_.get("oml:input", []): + if _input["@name"] == "estimation_procedure": + task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"] else: - value = input.get("#text") - task[input["@name"]] = value + value = _input.get("#text") + task[_input["@name"]] = value # The number of qualities can range from 0 to infinity - for quality in task_.get("oml:quality", list()): + for quality in task_.get("oml:quality", []): if "#text" not in quality: quality_value = 0.0 else: @@ -295,10 +330,13 @@ def __list_tasks(api_call, output_format="dict"): tasks[tid] = task except KeyError as e: if tid is not None: - warnings.warn("Invalid xml for task %d: %s\nFrom %s" % (tid, e, task_)) + warnings.warn( + "Invalid xml for task %d: %s\nFrom %s" % (tid, e, task_), + RuntimeWarning, + stacklevel=2, + ) else: - warnings.warn("Could not find key %s in %s!" % (e, task_)) - continue + warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2) if output_format == "dataframe": tasks = pd.DataFrame.from_dict(tasks, orient="index") @@ -306,9 +344,12 @@ def __list_tasks(api_call, output_format="dict"): return tasks +# TODO(eddiebergman): Maybe since this isn't public api, we can make it keyword only? def get_tasks( - task_ids: List[int], download_data: bool = True, download_qualities: bool = True -) -> List[OpenMLTask]: + task_ids: list[int], + download_data: bool = True, # noqa: FBT001, FBT002 + download_qualities: bool = True, # noqa: FBT001, FBT002 +) -> list[OpenMLTask]: """Download tasks. This function iterates :meth:`openml.tasks.get_task`. 
@@ -334,7 +375,10 @@ def get_tasks( @openml.utils.thread_safe_if_oslo_installed def get_task( - task_id: int, *dataset_args, download_splits: Optional[bool] = None, **get_dataset_kwargs + task_id: int, + *dataset_args: Any, + download_splits: bool | None = None, + **get_dataset_kwargs: Any, ) -> OpenMLTask: """Download OpenML task for a given task ID. @@ -374,6 +418,7 @@ def get_task( "of ``True`` and be independent from `download_data`. To disable this message until " "version 0.15 explicitly set `download_splits` to a bool.", FutureWarning, + stacklevel=3, ) download_splits = get_dataset_kwargs.get("download_data", True) @@ -382,17 +427,15 @@ def get_task( warnings.warn( "Task id must be specified as `int` from 0.14.0 onwards.", FutureWarning, + stacklevel=3, ) try: task_id = int(task_id) - except (ValueError, TypeError): - raise ValueError("Dataset ID is neither an Integer nor can be cast to an Integer.") + except (ValueError, TypeError) as e: + raise ValueError("Dataset ID is neither an Integer nor can be cast to an Integer.") from e - tid_cache_dir = openml.utils._create_cache_directory_for_id( - TASKS_CACHE_DIR_NAME, - task_id, - ) + tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) try: task = _get_task_description(task_id) @@ -404,38 +447,29 @@ def get_task( task.class_labels = dataset.retrieve_class_labels(task.target_name) # Clustering tasks do not have class labels # and do not offer download_split - if download_splits: - if isinstance(task, OpenMLSupervisedTask): - task.download_split() + if download_splits and isinstance(task, OpenMLSupervisedTask): + task.download_split() except Exception as e: - openml.utils._remove_cache_dir_for_id( - TASKS_CACHE_DIR_NAME, - tid_cache_dir, - ) + openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) raise e return task -def _get_task_description(task_id): +def _get_task_description(task_id: int) -> OpenMLTask: try: return _get_cached_task(task_id) except 
OpenMLCacheException: - xml_file = os.path.join( - openml.utils._create_cache_directory_for_id( - TASKS_CACHE_DIR_NAME, - task_id, - ), - "task.xml", - ) + _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) + xml_file = _cache_dir / "task.xml" task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get") - with io.open(xml_file, "w", encoding="utf8") as fh: + with xml_file.open("w", encoding="utf8") as fh: fh.write(task_xml) return _create_task_from_xml(task_xml) -def _create_task_from_xml(xml): +def _create_task_from_xml(xml: str) -> OpenMLTask: """Create a task given a xml string. Parameters @@ -448,8 +482,8 @@ def _create_task_from_xml(xml): OpenMLTask """ dic = xmltodict.parse(xml)["oml:task"] - estimation_parameters = dict() - inputs = dict() + estimation_parameters = {} + inputs = {} # Due to the unordered structure we obtain, we first have to extract # the possible keys of oml:input; dic["oml:input"] is a list of # OrderedDicts @@ -508,22 +542,20 @@ def _create_task_from_xml(xml): }.get(task_type) if cls is None: raise NotImplementedError("Task type %s not supported." % common_kwargs["task_type"]) - return cls(**common_kwargs) + return cls(**common_kwargs) # type: ignore +# TODO(eddiebergman): overload on `task_type` def create_task( task_type: TaskType, dataset_id: int, estimation_procedure_id: int, - target_name: Optional[str] = None, - evaluation_measure: Optional[str] = None, - **kwargs, -) -> Union[ - OpenMLClassificationTask, - OpenMLRegressionTask, - OpenMLLearningCurveTask, - OpenMLClusteringTask, -]: + target_name: str | None = None, + evaluation_measure: str | None = None, + **kwargs: Any, +) -> ( + OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask +): """Create a task based on different given attributes. 
Builds a task object with the function arguments as @@ -556,25 +588,26 @@ def create_task( OpenMLClassificationTask, OpenMLRegressionTask, OpenMLLearningCurveTask, OpenMLClusteringTask """ - task_cls = { - TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask, - TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask, - TaskType.CLUSTERING: OpenMLClusteringTask, - TaskType.LEARNING_CURVE: OpenMLLearningCurveTask, - }.get(task_type) - - if task_cls is None: - raise NotImplementedError("Task type {0:d} not supported.".format(task_type)) + if task_type == TaskType.CLUSTERING: + task_cls = OpenMLClusteringTask + elif task_type == TaskType.LEARNING_CURVE: + task_cls = OpenMLLearningCurveTask # type: ignore + elif task_type == TaskType.SUPERVISED_CLASSIFICATION: + task_cls = OpenMLClassificationTask # type: ignore + elif task_type == TaskType.SUPERVISED_REGRESSION: + task_cls = OpenMLRegressionTask # type: ignore else: - return task_cls( - task_type_id=task_type, - task_type=None, - data_set_id=dataset_id, - target_name=target_name, - estimation_procedure_id=estimation_procedure_id, - evaluation_measure=evaluation_measure, - **kwargs, - ) + raise NotImplementedError(f"Task type {task_type:d} not supported.") + + return task_cls( + task_type_id=task_type, + task_type="None", # TODO: refactor to get task type string from ID. 
+ data_set_id=dataset_id, + target_name=target_name, + estimation_procedure_id=estimation_procedure_id, + evaluation_measure=evaluation_measure, + **kwargs, + ) def delete_task(task_id: int) -> bool: diff --git a/openml/tasks/split.py b/openml/tasks/split.py index bc0dac55d..81105f1fd 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -1,17 +1,24 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import namedtuple, OrderedDict -import os import pickle +from collections import OrderedDict +from pathlib import Path +from typing import Any +from typing_extensions import NamedTuple +import arff # type: ignore import numpy as np -import arff -Split = namedtuple("Split", ["train", "test"]) +class Split(NamedTuple): + """A single split of a dataset.""" + train: np.ndarray + test: np.ndarray -class OpenMLSplit(object): + +class OpenMLSplit: """OpenML Split object. Parameters @@ -21,29 +28,37 @@ class OpenMLSplit(object): split : dict """ - def __init__(self, name, description, split): + def __init__( + self, + name: int | str, + description: str, + split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]], + ): self.description = description self.name = name - self.split = dict() + self.split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]] = {} # Add splits according to repetition for repetition in split: - repetition = int(repetition) - self.split[repetition] = OrderedDict() - for fold in split[repetition]: - self.split[repetition][fold] = OrderedDict() - for sample in split[repetition][fold]: - self.split[repetition][fold][sample] = split[repetition][fold][sample] + _rep = int(repetition) + self.split[_rep] = OrderedDict() + for fold in split[_rep]: + self.split[_rep][fold] = OrderedDict() + for sample in split[_rep][fold]: + self.split[_rep][fold][sample] = split[_rep][fold][sample] self.repeats = len(self.split) - if any([len(self.split[0]) != len(self.split[i]) for i in range(self.repeats)]): + + 
# TODO(eddiebergman): Better error message + if any(len(self.split[0]) != len(self.split[i]) for i in range(self.repeats)): raise ValueError("") + self.folds = len(self.split[0]) self.samples = len(self.split[0][0]) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if ( - type(self) != type(other) + (not isinstance(self, type(other))) or self.name != other.name or self.description != other.description or self.split.keys() != other.split.keys() @@ -69,23 +84,26 @@ def __eq__(self, other): return True @classmethod - def _from_arff_file(cls, filename: str) -> "OpenMLSplit": + def _from_arff_file(cls, filename: Path) -> OpenMLSplit: # noqa: C901, PLR0912 repetitions = None + name = None - pkl_filename = filename.replace(".arff", ".pkl.py3") + pkl_filename = filename.with_suffix(".pkl.py3") - if os.path.exists(pkl_filename): - with open(pkl_filename, "rb") as fh: - _ = pickle.load(fh) - repetitions = _["repetitions"] - name = _["name"] + if pkl_filename.exists(): + with pkl_filename.open("rb") as fh: + # TODO(eddiebergman): Would be good to figure out what _split is and assert it is + _split = pickle.load(fh) # noqa: S301 + repetitions = _split["repetitions"] + name = _split["name"] # Cache miss if repetitions is None: # Faster than liac-arff and sufficient in this situation! - if not os.path.exists(filename): - raise FileNotFoundError("Split arff %s does not exist!" 
% filename) - file_data = arff.load(open(filename), return_type=arff.DENSE_GEN) + if not filename.exists(): + raise FileNotFoundError(f"Split arff {filename} does not exist!") + + file_data = arff.load(filename.open("r"), return_type=arff.DENSE_GEN) splits = file_data["data"] name = file_data["relation"] attrnames = [attr[0] for attr in file_data["attributes"]] @@ -130,15 +148,34 @@ def _from_arff_file(cls, filename: str) -> "OpenMLSplit": np.array(repetitions[repetition][fold][sample][1], dtype=np.int32), ) - with open(pkl_filename, "wb") as fh: + with pkl_filename.open("wb") as fh: pickle.dump({"name": name, "repetitions": repetitions}, fh, protocol=2) + assert name is not None return cls(name, "", repetitions) - def from_dataset(self, X, Y, folds, repeats): - raise NotImplementedError() - - def get(self, repeat=0, fold=0, sample=0): + def get(self, repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[np.ndarray, np.ndarray]: + """Returns the specified data split from the CrossValidationSplit object. + + Parameters + ---------- + repeat : int + Index of the repeat to retrieve. + fold : int + Index of the fold to retrieve. + sample : int + Index of the sample to retrieve. + + Returns + ------- + numpy.ndarray + The data split for the specified repeat, fold, and sample. + + Raises + ------ + ValueError + If the specified repeat, fold, or sample is not known. + """ if repeat not in self.split: raise ValueError("Repeat %s not known" % str(repeat)) if fold not in self.split[repeat]: diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 36e0ada1c..4ad4cec62 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -1,25 +1,34 @@ # License: BSD 3-Clause +# TODO(eddiebergman): Seems like a lot of the subclasses could just get away with setting +# a `ClassVar` for whatever changes as their `__init__` defaults, less duplicated code. 
+from __future__ import annotations + import warnings from abc import ABC -from collections import OrderedDict from enum import Enum -import io -import os -from typing import Union, Tuple, Dict, List, Optional, Any -from warnings import warn - -import numpy as np -import pandas as pd -import scipy.sparse +from pathlib import Path +from typing import TYPE_CHECKING, Any, Sequence +from typing_extensions import Literal, TypedDict, overload import openml._api_calls +import openml.config +from openml import datasets from openml.base import OpenMLBase -from .. import datasets +from openml.utils import _create_cache_directory_for_id + from .split import OpenMLSplit -from ..utils import _create_cache_directory_for_id + +if TYPE_CHECKING: + import numpy as np + import pandas as pd + import scipy.sparse +# TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used +# and stored on server. class TaskType(Enum): + """Possible task types as defined in OpenML.""" + SUPERVISED_CLASSIFICATION = 1 SUPERVISED_REGRESSION = 2 LEARNING_CURVE = 3 @@ -31,61 +40,76 @@ class TaskType(Enum): MULTITASK_REGRESSION = 9 +class _EstimationProcedure(TypedDict): + type: str | None + parameters: dict[str, str] | None + data_splits_url: str | None + + class OpenMLTask(OpenMLBase): """OpenML Task object. Parameters ---------- - task_type_id : TaskType - Refers to the type of task. - task_type : str - Refers to the task. + task_id: Union[int, None] + Refers to the unique identifier of OpenML task. + task_type_id: TaskType + Refers to the type of OpenML task. + task_type: str + Refers to the OpenML task. data_set_id: int Refers to the data. estimation_procedure_id: int Refers to the type of estimates used. + estimation_procedure_type: str, default=None + Refers to the type of estimation procedure used for the OpenML task. + estimation_parameters: [Dict[str, str]], default=None + Estimation parameters used for the OpenML task. 
+ evaluation_measure: str, default=None + Refers to the evaluation measure. + data_splits_url: str, default=None + Refers to the URL of the data splits used for the OpenML task. """ - def __init__( + def __init__( # noqa: PLR0913 self, - task_id: Optional[int], + task_id: int | None, task_type_id: TaskType, task_type: str, data_set_id: int, estimation_procedure_id: int = 1, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - evaluation_measure: Optional[str] = None, - data_splits_url: Optional[str] = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + evaluation_measure: str | None = None, + data_splits_url: str | None = None, ): self.task_id = int(task_id) if task_id is not None else None self.task_type_id = task_type_id self.task_type = task_type self.dataset_id = int(data_set_id) self.evaluation_measure = evaluation_measure - self.estimation_procedure = ( - dict() - ) # type: Dict[str, Optional[Union[str, Dict]]] # noqa E501 - self.estimation_procedure["type"] = estimation_procedure_type - self.estimation_procedure["parameters"] = estimation_parameters - self.estimation_procedure["data_splits_url"] = data_splits_url + self.estimation_procedure: _EstimationProcedure = { + "type": estimation_procedure_type, + "parameters": estimation_parameters, + "data_splits_url": data_splits_url, + } self.estimation_procedure_id = estimation_procedure_id - self.split = None # type: Optional[OpenMLSplit] + self.split: OpenMLSplit | None = None @classmethod def _entity_letter(cls) -> str: return "t" @property - def id(self) -> Optional[int]: + def id(self) -> int | None: + """Return the OpenML ID of this task.""" return self.task_id - def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: """Collect all information to display in the __repr__ body.""" - 
fields: Dict[str, Any] = { - "Task Type Description": "{}/tt/{}".format( - openml.config.get_server_base_url(), self.task_type_id - ) + base_server_url = openml.config.get_server_base_url() + fields: dict[str, Any] = { + "Task Type Description": f"{base_server_url}/tt/{self.task_type_id}" } if self.task_id is not None: fields["Task ID"] = self.task_id @@ -94,10 +118,17 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: fields["Evaluation Measure"] = self.evaluation_measure if self.estimation_procedure is not None: fields["Estimation Procedure"] = self.estimation_procedure["type"] - if getattr(self, "target_name", None) is not None: - fields["Target Feature"] = getattr(self, "target_name") - if hasattr(self, "class_labels") and getattr(self, "class_labels") is not None: - fields["# of Classes"] = len(getattr(self, "class_labels")) + + # TODO(eddiebergman): Subclasses could advertise/provide this, instead of having to + # have the base class know about it's subclasses. 
+ target_name = getattr(self, "target_name", None) + if target_name is not None: + fields["Target Feature"] = target_name + + class_labels = getattr(self, "class_labels", None) + if class_labels is not None: + fields["# of Classes"] = len(class_labels) + if hasattr(self, "cost_matrix"): fields["Cost Matrix"] = "Available" @@ -115,7 +146,7 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: return [(key, fields[key]) for key in order if key in fields] def get_dataset(self) -> datasets.OpenMLDataset: - """Download dataset associated with task""" + """Download dataset associated with task.""" return datasets.get_dataset(self.dataset_id) def get_train_test_split_indices( @@ -123,80 +154,68 @@ def get_train_test_split_indices( fold: int = 0, repeat: int = 0, sample: int = 0, - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray]: + """Get the indices of the train and test splits for a given task.""" # Replace with retrieve from cache if self.split is None: self.split = self.download_split() - train_indices, test_indices = self.split.get( - repeat=repeat, - fold=fold, - sample=sample, - ) - return train_indices, test_indices + return self.split.get(repeat=repeat, fold=fold, sample=sample) - def _download_split(self, cache_file: str): + def _download_split(self, cache_file: Path) -> None: + # TODO(eddiebergman): Not sure about this try to read and error approach try: - with io.open(cache_file, encoding="utf8"): + with cache_file.open(encoding="utf8"): pass - except (OSError, IOError): + except OSError: split_url = self.estimation_procedure["data_splits_url"] openml._api_calls._download_text_file( source=str(split_url), - output_path=cache_file, + output_path=str(cache_file), ) def download_split(self) -> OpenMLSplit: """Download the OpenML split for a given task.""" - cached_split_file = os.path.join( - _create_cache_directory_for_id("tasks", self.task_id), - "datasplits.arff", - ) + # TODO(eddiebergman): Can this every 
be `None`? + assert self.task_id is not None + cache_dir = _create_cache_directory_for_id("tasks", self.task_id) + cached_split_file = cache_dir / "datasplits.arff" try: split = OpenMLSplit._from_arff_file(cached_split_file) - except (OSError, IOError): + except OSError: # Next, download and cache the associated split file self._download_split(cached_split_file) split = OpenMLSplit._from_arff_file(cached_split_file) return split - def get_split_dimensions(self) -> Tuple[int, int, int]: + def get_split_dimensions(self) -> tuple[int, int, int]: + """Get the (repeats, folds, samples) of the split for a given task.""" if self.split is None: self.split = self.download_split() return self.split.repeats, self.split.folds, self.split.samples - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - """Creates a dictionary representation of self.""" - task_container = OrderedDict() # type: OrderedDict[str, OrderedDict] - task_dict = OrderedDict( - [("@xmlns:oml", "http://openml.org/openml")] - ) # type: OrderedDict[str, Union[List, str, int]] - - task_container["oml:task_inputs"] = task_dict - task_dict["oml:task_type_id"] = self.task_type_id.value - - # having task_inputs and adding a type annotation - # solves wrong warnings - task_inputs = [ - OrderedDict([("@name", "source_data"), ("#text", str(self.dataset_id))]), - OrderedDict( - [("@name", "estimation_procedure"), ("#text", str(self.estimation_procedure_id))] - ), - ] # type: List[OrderedDict] - - if self.evaluation_measure is not None: - task_inputs.append( - OrderedDict([("@name", "evaluation_measures"), ("#text", self.evaluation_measure)]) - ) - - task_dict["oml:input"] = task_inputs - - return task_container + # TODO(eddiebergman): Really need some better typing on all this + def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: + """Creates a dictionary representation of self in a string format (for XML parsing).""" + oml_input = [ + {"@name": "source_data", "#text": 
str(self.dataset_id)}, + {"@name": "estimation_procedure", "#text": str(self.estimation_procedure_id)}, + ] + if self.evaluation_measure is not None: # + oml_input.append({"@name": "evaluation_measures", "#text": self.evaluation_measure}) + + return { + "oml:task_inputs": { + "@xmlns:oml": "http://openml.org/openml", + "oml:task_type_id": self.task_type_id.value, # This is an int from the enum? + "oml:input": oml_input, + } + } - def _parse_publish_response(self, xml_response: Dict): + def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.task_id = int(xml_response["oml:upload_task"]["oml:id"]) @@ -206,24 +225,42 @@ class OpenMLSupervisedTask(OpenMLTask, ABC): Parameters ---------- + task_type_id : TaskType + ID of the task type. + task_type : str + Name of the task type. + data_set_id : int + ID of the OpenML dataset associated with the task. target_name : str Name of the target feature (the class variable). + estimation_procedure_id : int, default=None + ID of the estimation procedure for the task. + estimation_procedure_type : str, default=None + Type of the estimation procedure for the task. + estimation_parameters : dict, default=None + Estimation parameters for the task. + evaluation_measure : str, default=None + Name of the evaluation measure for the task. + data_splits_url : str, default=None + URL of the data splits for the task. + task_id: Union[int, None] + Refers to the unique identifier of task. 
""" - def __init__( + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 1, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - evaluation_measure: Optional[str] = None, - data_splits_url: Optional[str] = None, - task_id: Optional[int] = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + evaluation_measure: str | None = None, + data_splits_url: str | None = None, + task_id: int | None = None, ): - super(OpenMLSupervisedTask, self).__init__( + super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, @@ -237,11 +274,30 @@ def __init__( self.target_name = target_name + @overload def get_X_and_y( - self, - dataset_format: str = "array", - ) -> Tuple[ - Union[np.ndarray, pd.DataFrame, scipy.sparse.spmatrix], Union[np.ndarray, pd.Series] + self, dataset_format: Literal["array"] = "array" + ) -> tuple[ + np.ndarray | scipy.sparse.spmatrix, + np.ndarray | None, + ]: + ... + + @overload + def get_X_and_y( + self, dataset_format: Literal["dataframe"] + ) -> tuple[ + pd.DataFrame, + pd.Series | pd.DataFrame | None, + ]: + ... + + # TODO(eddiebergman): Do all OpenMLSupervisedTask have a `y`? + def get_X_and_y( + self, dataset_format: Literal["dataframe", "array"] = "array" + ) -> tuple[ + np.ndarray | pd.DataFrame | scipy.sparse.spmatrix, + np.ndarray | pd.Series | pd.DataFrame | None, ]: """Get data associated with the current task. 
@@ -273,34 +329,35 @@ def get_X_and_y( TaskType.LEARNING_CURVE, ): raise NotImplementedError(self.task_type) + X, y, _, _ = dataset.get_data( dataset_format=dataset_format, target=self.target_name, ) return X, y - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - task_container = super(OpenMLSupervisedTask, self)._to_dict() - task_dict = task_container["oml:task_inputs"] - - task_dict["oml:input"].append( - OrderedDict([("@name", "target_feature"), ("#text", self.target_name)]) - ) + def _to_dict(self) -> dict[str, dict]: + task_container = super()._to_dict() + oml_input = task_container["oml:task_inputs"]["oml:input"] # type: ignore + assert isinstance(oml_input, list) + oml_input.append({"@name": "target_feature", "#text": self.target_name}) return task_container @property - def estimation_parameters(self): - warn( + def estimation_parameters(self) -> dict[str, str] | None: + """Return the estimation parameters for the task.""" + warnings.warn( "The estimation_parameters attribute will be " "deprecated in the future, please use " "estimation_procedure['parameters'] instead", PendingDeprecationWarning, + stacklevel=2, ) return self.estimation_procedure["parameters"] @estimation_parameters.setter - def estimation_parameters(self, est_parameters): + def estimation_parameters(self, est_parameters: dict[str, str] | None) -> None: self.estimation_procedure["parameters"] = est_parameters @@ -309,26 +366,48 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): Parameters ---------- - class_labels : List of str (optional) - cost_matrix: array (optional) + task_type_id : TaskType + ID of the Classification task type. + task_type : str + Name of the Classification task type. + data_set_id : int + ID of the OpenML dataset associated with the Classification task. + target_name : str + Name of the target variable. + estimation_procedure_id : int, default=None + ID of the estimation procedure for the Classification task. 
+ estimation_procedure_type : str, default=None + Type of the estimation procedure. + estimation_parameters : dict, default=None + Estimation parameters for the Classification task. + evaluation_measure : str, default=None + Name of the evaluation measure. + data_splits_url : str, default=None + URL of the data splits for the Classification task. + task_id : Union[int, None] + ID of the Classification task (if it already exists on OpenML). + class_labels : List of str, default=None + A list of class labels (for classification tasks). + cost_matrix : array, default=None + A cost matrix (for classification tasks). """ - def __init__( + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 1, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - evaluation_measure: Optional[str] = None, - data_splits_url: Optional[str] = None, - task_id: Optional[int] = None, - class_labels: Optional[List[str]] = None, - cost_matrix: Optional[np.ndarray] = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + evaluation_measure: str | None = None, + data_splits_url: str | None = None, + task_id: int | None = None, + class_labels: list[str] | None = None, + cost_matrix: np.ndarray | None = None, ): - super(OpenMLClassificationTask, self).__init__( + super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, @@ -348,22 +427,46 @@ def __init__( class OpenMLRegressionTask(OpenMLSupervisedTask): - """OpenML Regression object.""" + """OpenML Regression object. + + Parameters + ---------- + task_type_id : TaskType + Task type ID of the OpenML Regression task. + task_type : str + Task type of the OpenML Regression task. + data_set_id : int + ID of the OpenML dataset. + target_name : str + Name of the target feature used in the Regression task. 
+ estimation_procedure_id : int, default=None + ID of the OpenML estimation procedure. + estimation_procedure_type : str, default=None + Type of the OpenML estimation procedure. + estimation_parameters : dict, default=None + Parameters used by the OpenML estimation procedure. + data_splits_url : str, default=None + URL of the OpenML data splits for the Regression task. + task_id : Union[int, None] + ID of the OpenML Regression task. + evaluation_measure : str, default=None + Evaluation measure used in the Regression task. + """ - def __init__( + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 7, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - data_splits_url: Optional[str] = None, - task_id: Optional[int] = None, - evaluation_measure: Optional[str] = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + data_splits_url: str | None = None, + task_id: int | None = None, + evaluation_measure: str | None = None, ): - super(OpenMLRegressionTask, self).__init__( + super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, @@ -382,25 +485,43 @@ class OpenMLClusteringTask(OpenMLTask): Parameters ---------- - target_name : str (optional) + task_type_id : TaskType + Task type ID of the OpenML clustering task. + task_type : str + Task type of the OpenML clustering task. + data_set_id : int + ID of the OpenML dataset used in clustering the task. + estimation_procedure_id : int, default=None + ID of the OpenML estimation procedure. + task_id : Union[int, None] + ID of the OpenML clustering task. + estimation_procedure_type : str, default=None + Type of the OpenML estimation procedure used in the clustering task. + estimation_parameters : dict, default=None + Parameters used by the OpenML estimation procedure. 
+ data_splits_url : str, default=None + URL of the OpenML data splits for the clustering task. + evaluation_measure : str, default=None + Evaluation measure used in the clustering task. + target_name : str, default=None Name of the target feature (class) that is not part of the feature set for the clustering task. """ - def __init__( + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, estimation_procedure_id: int = 17, - task_id: Optional[int] = None, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - data_splits_url: Optional[str] = None, - evaluation_measure: Optional[str] = None, - target_name: Optional[str] = None, + task_id: int | None = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + data_splits_url: str | None = None, + evaluation_measure: str | None = None, + target_name: str | None = None, ): - super(OpenMLClusteringTask, self).__init__( + super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, @@ -414,10 +535,21 @@ def __init__( self.target_name = target_name + @overload def get_X( self, - dataset_format: str = "array", - ) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.spmatrix]: + dataset_format: Literal["array"] = "array", + ) -> np.ndarray | scipy.sparse.spmatrix: + ... + + @overload + def get_X(self, dataset_format: Literal["dataframe"]) -> pd.DataFrame: + ... + + def get_X( + self, + dataset_format: Literal["array", "dataframe"] = "array", + ) -> np.ndarray | pd.DataFrame | scipy.sparse.spmatrix: """Get data associated with the current task. 
Parameters @@ -432,15 +564,10 @@ def get_X( """ dataset = self.get_dataset() - data, *_ = dataset.get_data( - dataset_format=dataset_format, - target=None, - ) + data, *_ = dataset.get_data(dataset_format=dataset_format, target=None) return data - def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - task_container = super(OpenMLClusteringTask, self)._to_dict() - + def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: # Right now, it is not supported as a feature. # Uncomment if it is supported on the server # in the future. @@ -455,28 +582,56 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": ]) ) """ - return task_container + return super()._to_dict() class OpenMLLearningCurveTask(OpenMLClassificationTask): - """OpenML Learning Curve object.""" + """OpenML Learning Curve object. + + Parameters + ---------- + task_type_id : TaskType + ID of the Learning Curve task. + task_type : str + Name of the Learning Curve task. + data_set_id : int + ID of the dataset that this task is associated with. + target_name : str + Name of the target feature in the dataset. + estimation_procedure_id : int, default=None + ID of the estimation procedure to use for evaluating models. + estimation_procedure_type : str, default=None + Type of the estimation procedure. + estimation_parameters : dict, default=None + Additional parameters for the estimation procedure. + data_splits_url : str, default=None + URL of the file containing the data splits for Learning Curve task. + task_id : Union[int, None] + ID of the Learning Curve task. + evaluation_measure : str, default=None + Name of the evaluation measure to use for evaluating models. + class_labels : list of str, default=None + Class labels for Learning Curve tasks. + cost_matrix : numpy array, default=None + Cost matrix for Learning Curve tasks. 
+ """ - def __init__( + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 13, - estimation_procedure_type: Optional[str] = None, - estimation_parameters: Optional[Dict[str, str]] = None, - data_splits_url: Optional[str] = None, - task_id: Optional[int] = None, - evaluation_measure: Optional[str] = None, - class_labels: Optional[List[str]] = None, - cost_matrix: Optional[np.ndarray] = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + data_splits_url: str | None = None, + task_id: int | None = None, + evaluation_measure: str | None = None, + class_labels: list[str] | None = None, + cost_matrix: np.ndarray | None = None, ): - super(OpenMLLearningCurveTask, self).__init__( + super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, diff --git a/openml/testing.py b/openml/testing.py index ecb9620e1..4af361507 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -1,22 +1,32 @@ # License: BSD 3-Clause +from __future__ import annotations import hashlib import inspect +import logging import os import pathlib import shutil -import sys import time -from typing import Dict, Union, cast import unittest +from pathlib import Path +from typing import ClassVar + import pandas as pd import requests import openml -from openml.tasks import TaskType from openml.exceptions import OpenMLServerException +from openml.tasks import TaskType -import logging + +def _check_dataset(dataset: dict) -> None: + assert isinstance(dataset, dict) + assert len(dataset) >= 2 + assert "did" in dataset + assert isinstance(dataset["did"], int) + assert "status" in dataset + assert dataset["status"] in ["in_preparation", "active", "deactivated"] class TestBase(unittest.TestCase): @@ -28,14 +38,16 @@ class TestBase(unittest.TestCase): Hopefully soon allows using a test server, not the production server. 
""" - publish_tracker = { + # TODO: This could be made more explcit with a TypedDict instead of list[str | int] + publish_tracker: ClassVar[dict[str, list[str | int]]] = { "run": [], "data": [], "flow": [], "task": [], "study": [], "user": [], - } # type: dict + } + flow_name_tracker: ClassVar[list[str]] = [] test_server = "https://test.openml.org/api/v1/xml" # amueller's read/write key that he will throw away later apikey = "610344db6388d9ba34f6db45a3cf71de" @@ -44,7 +56,7 @@ class TestBase(unittest.TestCase): logger = logging.getLogger("unit_tests_published_entities") logger.setLevel(logging.DEBUG) - def setUp(self, n_levels: int = 1): + def setUp(self, n_levels: int = 1) -> None: """Setup variables and temporary directories. In particular, this methods: @@ -61,31 +73,30 @@ def setUp(self, n_levels: int = 1): Number of nested directories the test is in. Necessary to resolve the path to the ``files`` directory, which is located directly under the ``tests`` directory. """ - # This cache directory is checked in to git to simulate a populated # cache self.maxDiff = None - self.static_cache_dir = None - abspath_this_file = os.path.abspath(inspect.getfile(self.__class__)) - static_cache_dir = os.path.dirname(abspath_this_file) + abspath_this_file = Path(inspect.getfile(self.__class__)).absolute() + static_cache_dir = abspath_this_file.parent for _ in range(n_levels): - static_cache_dir = os.path.abspath(os.path.join(static_cache_dir, "..")) + static_cache_dir = static_cache_dir.parent.absolute() + content = os.listdir(static_cache_dir) if "files" in content: - self.static_cache_dir = os.path.join(static_cache_dir, "files") - - if self.static_cache_dir is None: + static_cache_dir = static_cache_dir / "files" + else: raise ValueError( - "Cannot find test cache dir, expected it to be {}!".format(static_cache_dir) + f"Cannot find test cache dir, expected it to be {static_cache_dir}!", ) - self.cwd = os.getcwd() - workdir = os.path.dirname(os.path.abspath(__file__)) + 
self.static_cache_dir = static_cache_dir + self.cwd = Path.cwd() + workdir = Path(__file__).parent.absolute() tmp_dir_name = self.id() - self.workdir = os.path.join(workdir, tmp_dir_name) + self.workdir = workdir / tmp_dir_name shutil.rmtree(self.workdir, ignore_errors=True) - os.mkdir(self.workdir) + self.workdir.mkdir(exist_ok=True) os.chdir(self.workdir) self.cached = True @@ -93,29 +104,34 @@ def setUp(self, n_levels: int = 1): self.production_server = "https://openml.org/api/v1/xml" openml.config.server = TestBase.test_server openml.config.avoid_duplicate_runs = False - openml.config.set_root_cache_directory(self.workdir) + openml.config.set_root_cache_directory(str(self.workdir)) # Increase the number of retries to avoid spurious server failures self.retry_policy = openml.config.retry_policy self.connection_n_retries = openml.config.connection_n_retries openml.config.set_retry_policy("robot", n_retries=20) - def tearDown(self): + def tearDown(self) -> None: + """Tear down the test""" os.chdir(self.cwd) try: shutil.rmtree(self.workdir) - except PermissionError: - if os.name == "nt": + except PermissionError as e: + if os.name != "nt": # one of the files may still be used by another process - pass - else: - raise + raise e + openml.config.server = self.production_server openml.config.connection_n_retries = self.connection_n_retries openml.config.retry_policy = self.retry_policy @classmethod - def _mark_entity_for_removal(self, entity_type, entity_id): + def _mark_entity_for_removal( + cls, + entity_type: str, + entity_id: int, + entity_name: str | None = None, + ) -> None: """Static record of entities uploaded to test server Dictionary of lists where the keys are 'entity_type'. 
@@ -127,9 +143,12 @@ def _mark_entity_for_removal(self, entity_type, entity_id): TestBase.publish_tracker[entity_type] = [entity_id] else: TestBase.publish_tracker[entity_type].append(entity_id) + if isinstance(entity_type, openml.flows.OpenMLFlow): + assert entity_name is not None + cls.flow_name_tracker.append(entity_name) @classmethod - def _delete_entity_from_tracker(self, entity_type, entity): + def _delete_entity_from_tracker(cls, entity_type: str, entity: int) -> None: """Deletes entity records from the static file_tracker Given an entity type and corresponding ID, deletes all entries, including @@ -139,61 +158,69 @@ def _delete_entity_from_tracker(self, entity_type, entity): # removes duplicate entries TestBase.publish_tracker[entity_type] = list(set(TestBase.publish_tracker[entity_type])) if entity_type == "flow": - delete_index = [ + delete_index = next( i - for i, (id_, _) in enumerate(TestBase.publish_tracker[entity_type]) + for i, (id_, _) in enumerate( + zip(TestBase.publish_tracker[entity_type], TestBase.flow_name_tracker), + ) if id_ == entity - ][0] + ) else: - delete_index = [ + delete_index = next( i for i, id_ in enumerate(TestBase.publish_tracker[entity_type]) if id_ == entity - ][0] + ) TestBase.publish_tracker[entity_type].pop(delete_index) - def _get_sentinel(self, sentinel=None): + def _get_sentinel(self, sentinel: str | None = None) -> str: if sentinel is None: # Create a unique prefix for the flow. Necessary because the flow # is identified by its name and external version online. Having a # unique name allows us to publish the same flow in each test run. 
- md5 = hashlib.md5() + md5 = hashlib.md5() # noqa: S324 md5.update(str(time.time()).encode("utf-8")) md5.update(str(os.getpid()).encode("utf-8")) sentinel = md5.hexdigest()[:10] sentinel = "TEST%s" % sentinel return sentinel - def _add_sentinel_to_flow_name(self, flow, sentinel=None): + def _add_sentinel_to_flow_name( + self, + flow: openml.flows.OpenMLFlow, + sentinel: str | None = None, + ) -> tuple[openml.flows.OpenMLFlow, str]: sentinel = self._get_sentinel(sentinel=sentinel) - flows_to_visit = list() + flows_to_visit = [] flows_to_visit.append(flow) while len(flows_to_visit) > 0: current_flow = flows_to_visit.pop() - current_flow.name = "%s%s" % (sentinel, current_flow.name) + current_flow.name = f"{sentinel}{current_flow.name}" for subflow in current_flow.components.values(): flows_to_visit.append(subflow) return flow, sentinel - def _check_dataset(self, dataset): - self.assertEqual(type(dataset), dict) - self.assertGreaterEqual(len(dataset), 2) - self.assertIn("did", dataset) - self.assertIsInstance(dataset["did"], int) - self.assertIn("status", dataset) - self.assertIsInstance(dataset["status"], str) - self.assertIn(dataset["status"], ["in_preparation", "active", "deactivated"]) - - def _check_fold_timing_evaluations( + def _check_dataset(self, dataset: dict[str, str | int]) -> None: + _check_dataset(dataset) + assert isinstance(dataset, dict) + assert len(dataset) >= 2 + assert "did" in dataset + assert isinstance(dataset["did"], int) + assert "status" in dataset + assert isinstance(dataset["status"], str) + assert dataset["status"] in ["in_preparation", "active", "deactivated"] + + def _check_fold_timing_evaluations( # noqa: PLR0913 self, - fold_evaluations: Dict, + fold_evaluations: dict[str, dict[int, dict[int, float]]], num_repeats: int, num_folds: int, + *, max_time_allowed: float = 60000.0, task_type: TaskType = TaskType.SUPERVISED_CLASSIFICATION, check_scores: bool = True, - ): + ) -> None: """ Checks whether the right timing measures are attached 
to the run (before upload). Test is only performed for versions >= Python3.3 @@ -203,7 +230,6 @@ def _check_fold_timing_evaluations( default max_time_allowed (per fold, in milli seconds) = 1 minute, quite pessimistic """ - # a dict mapping from openml measure to a tuple with the minimum and # maximum allowed value check_measures = { @@ -222,31 +248,31 @@ def _check_fold_timing_evaluations( elif task_type == TaskType.SUPERVISED_REGRESSION: check_measures["mean_absolute_error"] = (0, float("inf")) - self.assertIsInstance(fold_evaluations, dict) - if sys.version_info[:2] >= (3, 3): - # this only holds if we are allowed to record time (otherwise some - # are missing) - self.assertEqual(set(fold_evaluations.keys()), set(check_measures.keys())) + assert isinstance(fold_evaluations, dict) + assert set(fold_evaluations.keys()) == set(check_measures.keys()) - for measure in check_measures.keys(): + for measure in check_measures: if measure in fold_evaluations: num_rep_entrees = len(fold_evaluations[measure]) - self.assertEqual(num_rep_entrees, num_repeats) + assert num_rep_entrees == num_repeats min_val = check_measures[measure][0] max_val = check_measures[measure][1] for rep in range(num_rep_entrees): num_fold_entrees = len(fold_evaluations[measure][rep]) - self.assertEqual(num_fold_entrees, num_folds) + assert num_fold_entrees == num_folds for fold in range(num_fold_entrees): evaluation = fold_evaluations[measure][rep][fold] - self.assertIsInstance(evaluation, float) - self.assertGreaterEqual(evaluation, min_val) - self.assertLessEqual(evaluation, max_val) + assert isinstance(evaluation, float) + assert evaluation >= min_val + assert evaluation <= max_val def check_task_existence( - task_type: TaskType, dataset_id: int, target_name: str, **kwargs -) -> Union[int, None]: + task_type: TaskType, + dataset_id: int, + target_name: str, + **kwargs: dict[str, str | int | dict[str, str | int | openml.tasks.TaskType]], +) -> int | None: """Checks if any task with exists on test 
server that matches the meta data. Parameter @@ -261,9 +287,10 @@ def check_task_existence( """ return_val = None tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe") + assert isinstance(tasks, pd.DataFrame) if len(tasks) == 0: return None - tasks = cast(pd.DataFrame, tasks).loc[tasks["did"] == dataset_id] + tasks = tasks.loc[tasks["did"] == dataset_id] if len(tasks) == 0: return None tasks = tasks.loc[tasks["target_feature"] == target_name] @@ -305,13 +332,13 @@ class CustomImputer(SimpleImputer): Helps bypass the sklearn extension duplicate operation check """ - pass - def create_request_response( - *, status_code: int, content_filepath: pathlib.Path + *, + status_code: int, + content_filepath: pathlib.Path, ) -> requests.Response: - with open(content_filepath, "r") as xml_response: + with content_filepath.open("r") as xml_response: response_body = xml_response.read() response = requests.Response() diff --git a/openml/utils.py b/openml/utils.py index ffcc308dd..80d7caaae 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -1,37 +1,58 @@ # License: BSD 3-Clause +from __future__ import annotations -import os -import xmltodict +import contextlib import shutil -from typing import TYPE_CHECKING, List, Tuple, Union, Type import warnings -import pandas as pd from functools import wraps -import collections +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Mapping, TypeVar, overload +from typing_extensions import Literal, ParamSpec + +import numpy as np +import pandas as pd +import xmltodict import openml import openml._api_calls import openml.exceptions + from . 
import config # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: from openml.base import OpenMLBase -oslo_installed = False -try: - # Currently, importing oslo raises a lot of warning that it will stop working - # under python3.8; remove this once they disappear - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - from oslo_concurrency import lockutils + P = ParamSpec("P") + R = TypeVar("R") + + +@overload +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: Literal[True] = ..., +) -> Any | None: + ... + - oslo_installed = True -except ImportError: - pass +@overload +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: Literal[False], +) -> Any: + ... -def extract_xml_tags(xml_tag_name, node, allow_none=True): +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: bool = True, +) -> Any | None: """Helper to extract xml tags from xmltodict. Parameters @@ -39,7 +60,7 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True): xml_tag_name : str Name of the xml tag to extract from the node. - node : object + node : Mapping[str, Any] Node object returned by ``xmltodict`` from which ``xml_tag_name`` should be extracted. 
@@ -52,46 +73,48 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True): object """ if xml_tag_name in node and node[xml_tag_name] is not None: - if isinstance(node[xml_tag_name], dict): - rval = [node[xml_tag_name]] - elif isinstance(node[xml_tag_name], str): - rval = [node[xml_tag_name]] - elif isinstance(node[xml_tag_name], list): - rval = node[xml_tag_name] - else: - raise ValueError("Received not string and non list as tag item") + if isinstance(node[xml_tag_name], (dict, str)): + return [node[xml_tag_name]] + if isinstance(node[xml_tag_name], list): + return node[xml_tag_name] - return rval - else: - if allow_none: - return None - else: - raise ValueError("Could not find tag '%s' in node '%s'" % (xml_tag_name, str(node))) + raise ValueError("Received not string and non list as tag item") + + if allow_none: + return None + + raise ValueError(f"Could not find tag '{xml_tag_name}' in node '{node!s}'") -def _get_rest_api_type_alias(oml_object: "OpenMLBase") -> str: +def _get_rest_api_type_alias(oml_object: OpenMLBase) -> str: """Return the alias of the openml entity as it is defined for the REST API.""" - rest_api_mapping: List[Tuple[Union[Type, Tuple], str]] = [ + rest_api_mapping: list[tuple[type | tuple, str]] = [ (openml.datasets.OpenMLDataset, "data"), (openml.flows.OpenMLFlow, "flow"), (openml.tasks.OpenMLTask, "task"), (openml.runs.OpenMLRun, "run"), ((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), "study"), ] - _, api_type_alias = [ + _, api_type_alias = next( (python_type, api_alias) for (python_type, api_alias) in rest_api_mapping if isinstance(oml_object, python_type) - ][0] + ) return api_type_alias -def _tag_openml_base(oml_object: "OpenMLBase", tag: str, untag: bool = False): +def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None: # noqa: FBT001, FBT002 api_type_alias = _get_rest_api_type_alias(oml_object) - _tag_entity(api_type_alias, oml_object.id, tag, untag) + if oml_object.id is None: + raise 
openml.exceptions.ObjectNotPublishedError( + f"Cannot tag an {api_type_alias} that has not been published yet." + "Please publish the object first before being able to tag it." + f"\n{oml_object}", + ) + _tag_entity(entity_type=api_type_alias, entity_id=oml_object.id, tag=tag, untag=untag) -def _tag_entity(entity_type, entity_id, tag, untag=False): +def _tag_entity(entity_type: str, entity_id: int, tag: str, *, untag: bool = False) -> list[str]: """ Function that tags or untags a given entity on OpenML. As the OpenML API tag functions all consist of the same format, this function covers @@ -119,27 +142,32 @@ def _tag_entity(entity_type, entity_id, tag, untag=False): """ legal_entities = {"data", "task", "flow", "setup", "run"} if entity_type not in legal_entities: - raise ValueError("Can't tag a %s" % entity_type) + raise ValueError(f"Can't tag a {entity_type}") - uri = "%s/tag" % entity_type - main_tag = "oml:%s_tag" % entity_type if untag: - uri = "%s/untag" % entity_type - main_tag = "oml:%s_untag" % entity_type + uri = f"{entity_type}/untag" + main_tag = f"oml:{entity_type}_untag" + else: + uri = f"{entity_type}/tag" + main_tag = f"oml:{entity_type}_tag" - post_variables = {"%s_id" % entity_type: entity_id, "tag": tag} - result_xml = openml._api_calls._perform_api_call(uri, "post", post_variables) + result_xml = openml._api_calls._perform_api_call( + uri, + "post", + {f"{entity_type}_id": entity_id, "tag": tag}, + ) result = xmltodict.parse(result_xml, force_list={"oml:tag"})[main_tag] if "oml:tag" in result: - return result["oml:tag"] - else: - # no tags, return empty list - return [] + return result["oml:tag"] # type: ignore + + # no tags, return empty list + return [] -def _delete_entity(entity_type, entity_id): +# TODO(eddiebergman): Maybe this can be made more specific with a Literal +def _delete_entity(entity_type: str, entity_id: int) -> bool: """ Function that deletes a given entity on OpenML. 
As the OpenML API tag functions all consist of the same format, this function covers @@ -197,7 +225,7 @@ def _delete_entity(entity_type, entity_id): message=( f"The {entity_type} can not be deleted because " f"it still has associated entities: {e.message}" - ) + ), ) from e if e.code in unknown_reason: raise openml.exceptions.OpenMLServerError( @@ -209,7 +237,42 @@ def _delete_entity(entity_type, entity_id): raise -def _list_all(listing_call, output_format="dict", *args, **filters): +@overload +def _list_all( + listing_call: Callable[P, Any], + list_output_format: Literal["dict"] = ..., + *args: P.args, + **filters: P.kwargs, +) -> dict: + ... + + +@overload +def _list_all( + listing_call: Callable[P, Any], + list_output_format: Literal["object"], + *args: P.args, + **filters: P.kwargs, +) -> dict: + ... + + +@overload +def _list_all( + listing_call: Callable[P, Any], + list_output_format: Literal["dataframe"], + *args: P.args, + **filters: P.kwargs, +) -> pd.DataFrame: + ... + + +def _list_all( # noqa: C901, PLR0912 + listing_call: Callable[P, Any], + list_output_format: Literal["dict", "dataframe", "object"] = "dict", + *args: P.args, + **filters: P.kwargs, +) -> dict | pd.DataFrame: """Helper to handle paged listing requests. Example usage: @@ -220,49 +283,45 @@ def _list_all(listing_call, output_format="dict", *args, **filters): ---------- listing_call : callable Call listing, e.g. list_evaluations. - output_format : str, optional (default='dict') + list_output_format : str, optional (default='dict') The parameter decides the format of the output. - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame + - If 'object' the output is a dict of objects (only for some `listing_call`) *args : Variable length argument list Any required arguments for the listing call. **filters : Arbitrary keyword arguments Any filters that can be applied to the listing function. additionally, the batch_size can be specified. 
This is useful for testing purposes. + Returns ------- dict or dataframe """ - # eliminate filters that have a None value active_filters = {key: value for key, value in filters.items() if value is not None} page = 0 - result = collections.OrderedDict() - if output_format == "dataframe": - result = pd.DataFrame() + result = pd.DataFrame() if list_output_format == "dataframe" else {} # Default batch size per paging. # This one can be set in filters (batch_size), but should not be # changed afterwards. The derived batch_size can be changed. - BATCH_SIZE_ORIG = 10000 - if "batch_size" in active_filters: - BATCH_SIZE_ORIG = active_filters["batch_size"] - del active_filters["batch_size"] + BATCH_SIZE_ORIG = active_filters.pop("batch_size", 10000) + if not isinstance(BATCH_SIZE_ORIG, int): + raise ValueError(f"'batch_size' should be an integer but got {BATCH_SIZE_ORIG}") # max number of results to be shown - LIMIT = None - offset = 0 - if "size" in active_filters: - LIMIT = active_filters["size"] - del active_filters["size"] + LIMIT: int | float | None = active_filters.pop("size", None) # type: ignore + if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)): + raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}") if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT: BATCH_SIZE_ORIG = LIMIT - if "offset" in active_filters: - offset = active_filters["offset"] - del active_filters["offset"] + offset = active_filters.pop("offset", 0) + if not isinstance(offset, int): + raise ValueError(f"'offset' should be an integer but got {offset}") batch_size = BATCH_SIZE_ORIG while True: @@ -270,24 +329,27 @@ def _list_all(listing_call, output_format="dict", *args, **filters): current_offset = offset + BATCH_SIZE_ORIG * page new_batch = listing_call( *args, - limit=batch_size, - offset=current_offset, - output_format=output_format, - **active_filters, + output_format=list_output_format, # type: ignore + **{**active_filters, "limit": batch_size, 
"offset": current_offset}, # type: ignore ) except openml.exceptions.OpenMLServerNoResult: # we want to return an empty dict in this case + # NOTE: This above statement may not actually happen, but we could just return here + # to enforce it... break - if output_format == "dataframe": + + if list_output_format == "dataframe": if len(result) == 0: result = new_batch else: result = pd.concat([result, new_batch], ignore_index=True) else: - # For output_format = 'dict' or 'object' + # For output_format = 'dict' (or catch all) result.update(new_batch) + if len(new_batch) < batch_size: break + page += 1 if LIMIT is not None: # check if the number of required results has been achieved @@ -295,24 +357,24 @@ def _list_all(listing_call, output_format="dict", *args, **filters): # in case of bugs to prevent infinite loops if len(result) >= LIMIT: break + # check if there are enough results to fulfill a batch - if BATCH_SIZE_ORIG > LIMIT - len(result): + if LIMIT - len(result) < BATCH_SIZE_ORIG: batch_size = LIMIT - len(result) return result -def _get_cache_dir_for_key(key): - cache = config.get_cache_directory() - return os.path.join(cache, key) +def _get_cache_dir_for_key(key: str) -> Path: + return Path(config.get_cache_directory()) / key -def _create_cache_directory(key): +def _create_cache_directory(key: str) -> Path: cache_dir = _get_cache_dir_for_key(key) try: - os.makedirs(cache_dir, exist_ok=True) - except Exception as e: + cache_dir.mkdir(exist_ok=True, parents=True) + except Exception as e: # noqa: BLE001 raise openml.exceptions.OpenMLCacheException( f"Cannot create cache directory {cache_dir}." 
) from e @@ -320,16 +382,12 @@ def _create_cache_directory(key): return cache_dir -def _get_cache_dir_for_id(key, id_, create=False): - if create: - cache_dir = _create_cache_directory(key) - else: - cache_dir = _get_cache_dir_for_key(key) +def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path: # noqa: FBT001, FBT002 + cache_dir = _create_cache_directory(key) if create else _get_cache_dir_for_key(key) + return Path(cache_dir) / str(id_) - return os.path.join(cache_dir, str(id_)) - -def _create_cache_directory_for_id(key, id_): +def _create_cache_directory_for_id(key: str, id_: int) -> Path: """Create the cache directory for a specific ID In order to have a clearer cache structure and because every task @@ -347,20 +405,18 @@ def _create_cache_directory_for_id(key, id_): Returns ------- - str + cache_dir : Path Path of the created dataset cache directory. """ cache_dir = _get_cache_dir_for_id(key, id_, create=True) - if os.path.isdir(cache_dir): - pass - elif os.path.exists(cache_dir): + if cache_dir.exists() and not cache_dir.is_dir(): raise ValueError("%s cache dir exists but is not a directory!" % key) - else: - os.makedirs(cache_dir) + + cache_dir.mkdir(exist_ok=True, parents=True) return cache_dir -def _remove_cache_dir_for_id(key, cache_dir): +def _remove_cache_dir_for_id(key: str, cache_dir: Path) -> None: """Remove the task cache directory This function is NOT thread/multiprocessing safe. @@ -373,18 +429,22 @@ def _remove_cache_dir_for_id(key, cache_dir): """ try: shutil.rmtree(cache_dir) - except (OSError, IOError): + except OSError as e: raise ValueError( - "Cannot remove faulty %s cache directory %s." - "Please do this manually!" % (key, cache_dir) - ) + f"Cannot remove faulty {key} cache directory {cache_dir}. 
Please do this manually!", + ) from e -def thread_safe_if_oslo_installed(func): - if oslo_installed: +def thread_safe_if_oslo_installed(func: Callable[P, R]) -> Callable[P, R]: + try: + # Currently, importing oslo raises a lot of warning that it will stop working + # under python3.8; remove this once they disappear + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + from oslo_concurrency import lockutils @wraps(func) - def safe_func(*args, **kwargs): + def safe_func(*args: P.args, **kwargs: P.kwargs) -> R: # Lock directories use the id that is passed as either positional or keyword argument. id_parameters = [parameter_name for parameter_name in kwargs if "_id" in parameter_name] if len(id_parameters) == 1: @@ -393,24 +453,21 @@ def safe_func(*args, **kwargs): id_ = args[0] else: raise RuntimeError( - "An id must be specified for {}, was passed: ({}, {}).".format( - func.__name__, args, kwargs - ) + f"An id must be specified for {func.__name__}, was passed: ({args}, {kwargs}).", ) # The [7:] gets rid of the 'openml.' prefix - lock_name = "{}.{}:{}".format(func.__module__[7:], func.__name__, id_) + lock_name = f"{func.__module__[7:]}.{func.__name__}:{id_}" with lockutils.external_lock(name=lock_name, lock_path=_create_lockfiles_dir()): return func(*args, **kwargs) return safe_func - else: + except ImportError: return func -def _create_lockfiles_dir(): - dir = os.path.join(config.get_cache_directory(), "locks") - try: - os.makedirs(dir) - except OSError: - pass - return dir +def _create_lockfiles_dir() -> Path: + path = Path(config.get_cache_directory()) / "locks" + # TODO(eddiebergman): Not sure why this is allowed to error and ignore??? 
+ with contextlib.suppress(OSError): + path.mkdir(exist_ok=True, parents=True) + return path diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..99ff2b804 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,318 @@ +# -*- coding: utf-8 -*- + +# License: BSD 3-Clause +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "openml" +dynamic = ["version"] # Will take it from the __version__ file, update there +dependencies = [ + "liac-arff>=2.4.0", + "xmltodict", + "requests", + "scikit-learn>=0.18", + "python-dateutil", # Installed through pandas anyway. + "pandas>=1.0.0", + "scipy>=0.13.3", + "numpy>=1.6.2", + "minio", + "pyarrow", +] +requires-python = ">=3.8" +authors = [ + { name = "Matthias Feurer", email="feurerm@informatik.uni-freiburg.de" }, + { name = "Jan van Rijn" }, + { name = "Arlind Kadra" }, + { name = "Pieter Gijsbers" }, + { name = "Neeratyoy Mallik" }, + { name = "Sahithya Ravi" }, + { name = "Andreas Müller" }, + { name = "Joaquin Vanschoren " }, + { name = "Frank Hutter" }, +] +readme = "README.md" +description = "Python API for OpenML" +classifiers = [ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] +license = { file = "LICENSE" } + +[project.scripts] +openml = "openml.cli:main" + +[project.optional-dependencies] +test=[ + "nbconvert", + "jupyter_client", + "matplotlib", + "pytest", + "pytest-xdist", + "pytest-timeout", + "nbformat", + "oslo.concurrency", + "flaky", + "pre-commit", + "pytest-cov", + "pytest-rerunfailures", + 
"mypy", + "ruff", +] +examples=[ + "matplotlib", + "jupyter", + "notebook", + "nbconvert", + "nbformat", + "jupyter_client", + "ipython", + "ipykernel", + "seaborn", +] +examples_unix=["fanova"] +docs=[ + "sphinx>=3", + "sphinx-gallery", + "sphinx_bootstrap_theme", + "numpydoc", +] + +[project.urls] +home="https://openml.org/" +documentation = "https://openml.github.io/openml-python/" +source = "https://github.com/openml/openml-python" + +[tool.setuptools.packages.find] +where = [""] +include = ["openml*"] +namespaces = false + +[tool.setuptools.package-data] +openml = ["*.txt", "*.md", "py.typed"] + +[tool.setuptools.dynamic] +version = {attr = "openml.__version__.__version__"} + +# https://docs.pytest.org/en/7.2.x/reference/reference.html#ini-options-ref +[tool.pytest.ini_options] +testpaths = ["tests"] +minversion = "7.0" +xfail_strict = true +filterwarnings=[ + "ignore:the matrix subclass:PendingDeprecationWarning" +] +markers = [ + "server: anything that connects to a server", + "upload: anything that uploads to a server", + "production: any interaction with the production server", + "cache: anything that interacts with the (test) cache", +] + +# https://github.com/charliermarsh/ruff +[tool.ruff] +target-version = "py37" +line-length = 100 +show-source = true +src = ["openml", "tests", "examples"] +unsafe-fixes = true + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +select = [ + "A", + # "ANN", # Handled by mypy + "ARG", + "B", + "BLE", + "COM", + "C4", + "D", + # "DTZ", # One day I should know how to utilize timezones and dates... 
+ "E", + # "EXE", Meh + "ERA", + "F", + "FBT", + "I", + # "ISC", # Favours implicit string concatenation + "INP", + # "INT", # I don't understand this one + "N", + "NPY", + "PD", + "PLC", + "PLE", + "PLR", + "PLW", + "PIE", + "PT", + "PTH", + # "PYI", # Specific to .pyi files for type stubs + "Q", + "PGH004", + "RET", + "RUF", + "C90", + "S", + # "SLF", # Private member accessed (sure, it's python) + "SIM", + # "TRY", # Good in principle, would take a lot of work to statisfy + "T10", + "T20", + "TID", + "TCH", + "UP", + "N", + "W", + "YTT", +] + +ignore = [ + "D105", # Missing docstring in magic mthod + "D401", # First line of docstring should be in imperative mood + "N806", # Variable X in function should be lowercase + "E731", # Do not assign a lambda expression, use a def + "S101", # Use of assert detected. + "W292", # No newline at end of file + "PLC1901", # "" can be simplified to be falsey + "TCH003", # Move stdlib import into TYPE_CHECKING + "COM812", # Trailing comma missing (handled by linter, ruff recommend disabling if using formatter) + "N803", # Argument should be lowercase (but we accept things like `X`) + + # TODO(@eddibergman): These should be enabled + "D100", # Missing docstring in public module + "D103", # Missing docstring in public function + "D104", # Missing docstring in public package + + # TODO(@eddiebergman): Maybe fix + "PLR2004", # Magic value used in comparison, consider replacing 2 with a constant variable + "D400", # First line must end with a period (@eddiebergman too many to fix so ignoring this for now) + "D203", # 1 blank line required before class docstring + "D205", # 1 blank line between summary and description + + # TODO(@eddiebergman): Could be backwards breaking + "N802", # Public function name should be lower case (i.e. 
get_X()) +] + +exclude = [ + # TODO(eddiebergman): Tests should be re-enabled after the refactor + "tests", + # + ".bzr", + ".direnv", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv", + "docs", +] + +# Exclude a variety of commonly ignored directories. +[tool.ruff.per-file-ignores] +"tests/*.py" = [ + "D100", # Undocumented public module + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "S101", # Use of assert + "ANN201", # Missing return type annotation for public function + "FBT001", # Positional boolean argument + "PLR2004",# No use of magic numbers + "PD901", # X is a bad variable name. (pandas) + "TCH", # https://docs.astral.sh/ruff/rules/#flake8-type-checking-tch + "N803", # Argument name {name} should be lowercase +] +"openml/cli.py" = [ + "T201", # print found + "T203", # pprint found +] +"openml/__version__.py" = [ + "D100", # Undocumented public module +] +"__init__.py" = [ + "I002", # Missing required import (i.e. from __future__ import annotations) +] +"examples/*.py" = [ + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D415", # First line should end with a . or ? or ! + "INP001", # File is part of an implicit namespace package, add an __init__.py + "I002", # Missing required import (i.e. 
from __future__ import annotations) + "E741", # Ambiguous variable name + "T201", # print found + "T203", # pprint found + "ERA001", # found commented out code + "E402", # Module level import not at top of cell + "E501", # Line too long +] + + +[tool.ruff.isort] +known-first-party = ["openml"] +no-lines-before = ["future"] +required-imports = ["from __future__ import annotations"] +combine-as-imports = true +extra-standard-library = ["typing_extensions"] +force-wrap-aliases = true + +[tool.ruff.pydocstyle] +convention = "numpy" + +[tool.mypy] +python_version = "3.7" +packages = ["openml", "tests"] + +show_error_codes = true + +warn_unused_configs = true # warn about unused [tool.mypy] lines + +follow_imports = "normal" # Type check top level api code we use from imports +ignore_missing_imports = false # prefer explicit ignores + +disallow_untyped_defs = true # All functions must have types +disallow_untyped_decorators = true # ... even decorators +disallow_incomplete_defs = true # ...all types + +no_implicit_optional = true +check_untyped_defs = true + +warn_return_any = true + + +[[tool.mypy.overrides]] +module = ["tests.*", "openml.extensions.sklearn.*"] + +# TODO(eddiebergman): This should be re-enabled after tests get refactored +ignore_errors = true +#disallow_untyped_defs = false # Sometimes we just want to ignore verbose types +#disallow_untyped_decorators = false # Test decorators are not properly typed +#disallow_incomplete_defs = false # Sometimes we just want to ignore verbose types +#disable_error_code = ["var-annotated"] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 726c8fa73..000000000 --- a/setup.cfg +++ /dev/null @@ -1,6 +0,0 @@ -[metadata] -description-file = README.md - -[tool:pytest] -filterwarnings = - ignore:the matrix subclass:PendingDeprecationWarning diff --git a/setup.py b/setup.py deleted file mode 100644 index 9f3cdd0e6..000000000 --- a/setup.py +++ /dev/null @@ -1,112 +0,0 @@ -# -*- coding: utf-8 -*- - -# License: 
BSD 3-Clause - -import os -import setuptools -import sys - -with open("openml/__version__.py") as fh: - version = fh.readlines()[-1].split()[-1].strip("\"'") - -if sys.version_info < (3, 6): - raise ValueError( - "Unsupported Python version {}.{}.{} found. OpenML requires Python 3.6 or higher.".format( - sys.version_info.major, sys.version_info.minor, sys.version_info.micro - ) - ) - -with open(os.path.join("README.md"), encoding="utf-8") as fid: - README = fid.read() - -setuptools.setup( - name="openml", - author="Matthias Feurer, Jan van Rijn, Arlind Kadra, Pieter Gijsbers, " - "Neeratyoy Mallik, Sahithya Ravi, Andreas Müller, Joaquin Vanschoren " - "and Frank Hutter", - author_email="feurerm@informatik.uni-freiburg.de", - maintainer="Matthias Feurer", - maintainer_email="feurerm@informatik.uni-freiburg.de", - description="Python API for OpenML", - long_description=README, - long_description_content_type="text/markdown", - license="BSD 3-clause", - url="https://openml.org/", - project_urls={ - "Documentation": "https://openml.github.io/openml-python/", - "Source Code": "https://github.com/openml/openml-python", - }, - version=version, - # Make sure to remove stale files such as the egg-info before updating this: - # https://stackoverflow.com/a/26547314 - packages=setuptools.find_packages( - include=["openml.*", "openml"], - exclude=["*.tests", "*.tests.*", "tests.*", "tests"], - ), - package_data={"": ["*.txt", "*.md", "py.typed"]}, - python_requires=">=3.6", - install_requires=[ - "liac-arff>=2.4.0", - "xmltodict", - "requests", - "scikit-learn>=0.18", - "python-dateutil", # Installed through pandas anyway. 
- "pandas>=1.0.0", - "scipy>=0.13.3", - "numpy>=1.6.2", - "minio", - "pyarrow", - ], - extras_require={ - "test": [ - "nbconvert", - "jupyter_client", - "matplotlib", - "pytest", - "pytest-xdist", - "pytest-timeout", - "nbformat", - "oslo.concurrency", - "flaky", - "pre-commit", - "pytest-cov", - "pytest-rerunfailures", - "mypy", - ], - "examples": [ - "matplotlib", - "jupyter", - "notebook", - "nbconvert", - "nbformat", - "jupyter_client", - "ipython", - "ipykernel", - "seaborn", - ], - "examples_unix": ["fanova"], - "docs": [ - "sphinx>=3", - "sphinx-gallery", - "sphinx_bootstrap_theme", - "numpydoc", - ], - }, - test_suite="pytest", - classifiers=[ - "Intended Audience :: Science/Research", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python", - "Topic :: Software Development", - "Topic :: Scientific/Engineering", - "Operating System :: POSIX", - "Operating System :: Unix", - "Operating System :: MacOS", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - ], - entry_points={"console_scripts": ["openml=openml.cli:main"]}, -) diff --git a/tests/conftest.py b/tests/conftest.py index 43e2cc3ee..62fe3c7e8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,11 +21,11 @@ """ # License: BSD 3-Clause +from __future__ import annotations -import os import logging -import pathlib -from typing import List +import os +from pathlib import Path import pytest import openml @@ -52,29 +52,29 @@ def worker_id() -> str: return "master" -def read_file_list() -> List[pathlib.Path]: +def read_file_list() -> list[Path]: """Returns a list of paths to all files that currently exist in 'openml/tests/files/' - :return: List[pathlib.Path] + :return: List[Path] """ - test_files_dir = pathlib.Path(__file__).parent / "files" + test_files_dir = Path(__file__).parent / "files" return [f for f in 
test_files_dir.rglob("*") if f.is_file()] -def compare_delete_files(old_list: List[pathlib.Path], new_list: List[pathlib.Path]) -> None: +def compare_delete_files(old_list: list[Path], new_list: list[Path]) -> None: """Deletes files that are there in the new_list but not in the old_list - :param old_list: List[pathlib.Path] - :param new_list: List[pathlib.Path] + :param old_list: List[Path] + :param new_list: List[Path] :return: None """ file_list = list(set(new_list) - set(old_list)) for file in file_list: os.remove(file) - logger.info("Deleted from local: {}".format(file)) + logger.info(f"Deleted from local: {file}") -def delete_remote_files(tracker) -> None: +def delete_remote_files(tracker, flow_names) -> None: """Function that deletes the entities passed as input, from the OpenML test server The TestBase class in openml/testing.py has an attribute called publish_tracker. @@ -94,27 +94,27 @@ def delete_remote_files(tracker) -> None: # reordering to delete sub flows at the end of flows # sub-flows have shorter names, hence, sorting by descending order of flow name length if "flow" in tracker: + to_sort = list(zip(tracker["flow"], flow_names)) flow_deletion_order = [ - entity_id - for entity_id, _ in sorted(tracker["flow"], key=lambda x: len(x[1]), reverse=True) + entity_id for entity_id, _ in sorted(to_sort, key=lambda x: len(x[1]), reverse=True) ] - tracker["flow"] = flow_deletion_order + tracker["flow"] = [flow_deletion_order[1] for flow_id, _ in flow_deletion_order] # deleting all collected entities published to test server # 'run's are deleted first to prevent dependency issue of entities on deletion logger.info("Entity Types: {}".format(["run", "data", "flow", "task", "study"])) for entity_type in ["run", "data", "flow", "task", "study"]: - logger.info("Deleting {}s...".format(entity_type)) - for i, entity in enumerate(tracker[entity_type]): + logger.info(f"Deleting {entity_type}s...") + for _i, entity in enumerate(tracker[entity_type]): try: 
openml.utils._delete_entity(entity_type, entity) - logger.info("Deleted ({}, {})".format(entity_type, entity)) + logger.info(f"Deleted ({entity_type}, {entity})") except Exception as e: - logger.warning("Cannot delete ({},{}): {}".format(entity_type, entity, e)) + logger.warning(f"Cannot delete ({entity_type},{entity}): {e}") def pytest_sessionstart() -> None: - """pytest hook that is executed before any unit test starts + """Pytest hook that is executed before any unit test starts This function will be called by each of the worker processes, along with the master process when they are spawned. This happens even before the collection of unit tests. @@ -136,7 +136,7 @@ def pytest_sessionstart() -> None: def pytest_sessionfinish() -> None: - """pytest hook that is executed after all unit tests of a worker ends + """Pytest hook that is executed after all unit tests of a worker ends This function will be called by each of the worker processes, along with the master process when they are done with the unit tests allocated to them. 
@@ -154,11 +154,11 @@ def pytest_sessionfinish() -> None: # allows access to the file_list read in the set up phase global file_list worker = worker_id() - logger.info("Finishing worker {}".format(worker)) + logger.info(f"Finishing worker {worker}") # Test file deletion - logger.info("Deleting files uploaded to test server for worker {}".format(worker)) - delete_remote_files(TestBase.publish_tracker) + logger.info(f"Deleting files uploaded to test server for worker {worker}") + delete_remote_files(TestBase.publish_tracker, TestBase.flow_name_tracker) if worker == "master": # Local file deletion @@ -166,7 +166,7 @@ def pytest_sessionfinish() -> None: compare_delete_files(file_list, new_file_list) logger.info("Local files deleted") - logger.info("{} is killed".format(worker)) + logger.info(f"{worker} is killed") def pytest_configure(config): @@ -182,16 +182,58 @@ def pytest_addoption(parser): ) +def _expected_static_cache_state(root_dir: Path) -> list[Path]: + _c_root_dir = root_dir / "org" / "openml" / "test" + res_paths = [root_dir, _c_root_dir] + + for _d in ["datasets", "tasks", "runs", "setups"]: + res_paths.append(_c_root_dir / _d) + + for _id in ["-1","2"]: + tmp_p = _c_root_dir / "datasets" / _id + res_paths.extend([ + tmp_p / "dataset.arff", + tmp_p / "features.xml", + tmp_p / "qualities.xml", + tmp_p / "description.xml", + ]) + + res_paths.append(_c_root_dir / "datasets" / "30" / "dataset_30.pq") + res_paths.append(_c_root_dir / "runs" / "1" / "description.xml") + res_paths.append(_c_root_dir / "setups" / "1" / "description.xml") + + for _id in ["1", "3", "1882"]: + tmp_p = _c_root_dir / "tasks" / _id + res_paths.extend([ + tmp_p / "datasplits.arff", + tmp_p / "task.xml", + ]) + + return res_paths + + +def assert_static_test_cache_correct(root_dir: Path) -> None: + for p in _expected_static_cache_state(root_dir): + assert p.exists(), f"Expected path {p} does not exist" + + @pytest.fixture(scope="class") def long_version(request): request.cls.long_version = 
request.config.getoption("--long") -@pytest.fixture -def test_files_directory() -> pathlib.Path: - return pathlib.Path(__file__).parent / "files" +@pytest.fixture() +def test_files_directory() -> Path: + return Path(__file__).parent / "files" @pytest.fixture() def test_api_key() -> str: return "c0c42819af31e706efe1f4b88c23c6c1" + + +@pytest.fixture(autouse=True) +def verify_cache_state(test_files_directory) -> None: + assert_static_test_cache_correct(test_files_directory) + yield + assert_static_test_cache_correct(test_files_directory) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 93e0247d2..80da9c842 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -1,8 +1,9 @@ # License: BSD 3-Clause +from __future__ import annotations import os -from time import time import unittest.mock +from time import time import numpy as np import pandas as pd @@ -10,16 +11,17 @@ from scipy import sparse import openml -from openml.testing import TestBase +from openml.datasets import OpenMLDataFeature, OpenMLDataset from openml.exceptions import PyOpenMLError -from openml.datasets import OpenMLDataset, OpenMLDataFeature +from openml.testing import TestBase +@pytest.mark.production() class OpenMLDatasetTest(TestBase): _multiprocess_can_split_ = True def setUp(self): - super(OpenMLDatasetTest, self).setUp() + super().setUp() openml.config.server = self.production_server # Load dataset id 2 - dataset 2 is interesting because it contains @@ -77,7 +79,9 @@ def test_init_string_validation(self): with pytest.raises(ValueError, match="Invalid symbols 'ü' in citation"): openml.datasets.OpenMLDataset( - name="somename", description="a description", citation="Something by Müller" + name="somename", + description="a description", + citation="Something by Müller", ) def test__unpack_categories_with_nan_likes(self): @@ -94,14 +98,14 @@ def test__unpack_categories_with_nan_likes(self): def test_get_data_array(self): 
# Basic usage rval, _, categorical, attribute_names = self.dataset.get_data(dataset_format="array") - self.assertIsInstance(rval, np.ndarray) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual((898, 39), rval.shape) - self.assertEqual(len(categorical), 39) - self.assertTrue(all([isinstance(cat, bool) for cat in categorical])) - self.assertEqual(len(attribute_names), 39) - self.assertTrue(all([isinstance(att, str) for att in attribute_names])) - self.assertIsNone(_) + assert isinstance(rval, np.ndarray) + assert rval.dtype == np.float32 + assert rval.shape == (898, 39) + assert len(categorical) == 39 + assert all(isinstance(cat, bool) for cat in categorical) + assert len(attribute_names) == 39 + assert all(isinstance(att, str) for att in attribute_names) + assert _ is None # check that an error is raised when the dataset contains string err_msg = "PyOpenML cannot handle string when returning numpy arrays" @@ -110,9 +114,9 @@ def test_get_data_array(self): def test_get_data_pandas(self): data, _, _, _ = self.titanic.get_data(dataset_format="dataframe") - self.assertTrue(isinstance(data, pd.DataFrame)) - self.assertEqual(data.shape[1], len(self.titanic.features)) - self.assertEqual(data.shape[0], 1309) + assert isinstance(data, pd.DataFrame) + assert data.shape[1] == len(self.titanic.features) + assert data.shape[0] == 1309 col_dtype = { "pclass": "uint8", "survived": "category", @@ -130,30 +134,31 @@ def test_get_data_pandas(self): "home.dest": "object", } for col_name in data.columns: - self.assertTrue(data[col_name].dtype.name == col_dtype[col_name]) + assert data[col_name].dtype.name == col_dtype[col_name] X, y, _, _ = self.titanic.get_data( - dataset_format="dataframe", target=self.titanic.default_target_attribute + dataset_format="dataframe", + target=self.titanic.default_target_attribute, ) - self.assertTrue(isinstance(X, pd.DataFrame)) - self.assertTrue(isinstance(y, pd.Series)) - self.assertEqual(X.shape, (1309, 13)) - self.assertEqual(y.shape, 
(1309,)) + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) + assert X.shape == (1309, 13) + assert y.shape == (1309,) for col_name in X.columns: - self.assertTrue(X[col_name].dtype.name == col_dtype[col_name]) - self.assertTrue(y.dtype.name == col_dtype["survived"]) + assert X[col_name].dtype.name == col_dtype[col_name] + assert y.dtype.name == col_dtype["survived"] @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_boolean_pandas(self): # test to check that we are converting properly True and False even # with some inconsistency when dumping the data on openml data, _, _, _ = self.jm1.get_data() - self.assertTrue(data["defects"].dtype.name == "category") - self.assertTrue(set(data["defects"].cat.categories) == {True, False}) + assert data["defects"].dtype.name == "category" + assert set(data["defects"].cat.categories) == {True, False} data, _, _, _ = self.pc4.get_data() - self.assertTrue(data["c"].dtype.name == "category") - self.assertTrue(set(data["c"].cat.categories) == {True, False}) + assert data["c"].dtype.name == "category" + assert set(data["c"].cat.categories) == {True, False} def test_get_data_no_str_data_for_nparrays(self): # check that an error is raised when the dataset contains string @@ -169,59 +174,59 @@ def _check_expected_type(self, dtype, is_cat, col): else: expected_type = "float64" - self.assertEqual(dtype.name, expected_type) + assert dtype.name == expected_type @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_rowid(self): self.dataset.row_id_attribute = "condition" rval, _, categorical, _ = self.dataset.get_data(include_row_id=True) - self.assertIsInstance(rval, pd.DataFrame) + assert isinstance(rval, pd.DataFrame) for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): self._check_expected_type(dtype, is_cat, rval[col]) - self.assertEqual(rval.shape, (898, 39)) - self.assertEqual(len(categorical), 39) + assert rval.shape == 
(898, 39) + assert len(categorical) == 39 rval, _, categorical, _ = self.dataset.get_data() - self.assertIsInstance(rval, pd.DataFrame) + assert isinstance(rval, pd.DataFrame) for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): self._check_expected_type(dtype, is_cat, rval[col]) - self.assertEqual(rval.shape, (898, 38)) - self.assertEqual(len(categorical), 38) + assert rval.shape == (898, 38) + assert len(categorical) == 38 def test_get_data_with_target_array(self): X, y, _, attribute_names = self.dataset.get_data(dataset_format="array", target="class") - self.assertIsInstance(X, np.ndarray) - self.assertEqual(X.dtype, np.float32) - self.assertEqual(X.shape, (898, 38)) - self.assertIn(y.dtype, [np.int32, np.int64]) - self.assertEqual(y.shape, (898,)) - self.assertEqual(len(attribute_names), 38) - self.assertNotIn("class", attribute_names) + assert isinstance(X, np.ndarray) + assert X.dtype == np.float32 + assert X.shape == (898, 38) + assert y.dtype in [np.int32, np.int64] + assert y.shape == (898,) + assert len(attribute_names) == 38 + assert "class" not in attribute_names @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_target_pandas(self): X, y, categorical, attribute_names = self.dataset.get_data(target="class") - self.assertIsInstance(X, pd.DataFrame) + assert isinstance(X, pd.DataFrame) for dtype, is_cat, col in zip(X.dtypes, categorical, X): self._check_expected_type(dtype, is_cat, X[col]) - self.assertIsInstance(y, pd.Series) - self.assertEqual(y.dtype.name, "category") + assert isinstance(y, pd.Series) + assert y.dtype.name == "category" - self.assertEqual(X.shape, (898, 38)) - self.assertEqual(len(attribute_names), 38) - self.assertEqual(y.shape, (898,)) + assert X.shape == (898, 38) + assert len(attribute_names) == 38 + assert y.shape == (898,) - self.assertNotIn("class", attribute_names) + assert "class" not in attribute_names def test_get_data_rowid_and_ignore_and_target(self): 
self.dataset.ignore_attribute = ["condition"] self.dataset.row_id_attribute = ["hardness"] X, y, categorical, names = self.dataset.get_data(target="class") - self.assertEqual(X.shape, (898, 36)) - self.assertEqual(len(categorical), 36) + assert X.shape == (898, 36) + assert len(categorical) == 36 cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3 self.assertListEqual(categorical, cats) - self.assertEqual(y.shape, (898,)) + assert y.shape == (898,) @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_ignore_attributes(self): @@ -229,26 +234,26 @@ def test_get_data_with_ignore_attributes(self): rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True) for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): self._check_expected_type(dtype, is_cat, rval[col]) - self.assertEqual(rval.shape, (898, 39)) - self.assertEqual(len(categorical), 39) + assert rval.shape == (898, 39) + assert len(categorical) == 39 rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False) for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): self._check_expected_type(dtype, is_cat, rval[col]) - self.assertEqual(rval.shape, (898, 38)) - self.assertEqual(len(categorical), 38) + assert rval.shape == (898, 38) + assert len(categorical) == 38 def test_get_data_with_nonexisting_class(self): # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However, # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to # indices 4 and 5, and that nothing is mapped to index 3. 
_, y, _, _ = self.dataset.get_data("class", dataset_format="dataframe") - self.assertEqual(list(y.dtype.categories), ["1", "2", "3", "4", "5", "U"]) + assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"] _, y, _, _ = self.dataset.get_data("class", dataset_format="array") - self.assertEqual(np.min(y), 0) - self.assertEqual(np.max(y), 5) + assert np.min(y) == 0 + assert np.max(y) == 5 # Check that no label is mapped to 3, since it is reserved for label '4'. - self.assertEqual(np.sum(y == 3), 0) + assert np.sum(y == 3) == 0 def test_get_data_corrupt_pickle(self): # Lazy loaded dataset, populate cache. @@ -259,155 +264,216 @@ def test_get_data_corrupt_pickle(self): # Despite the corrupt file, the data should be loaded from the ARFF file. # A warning message is written to the python logger. xy, _, _, _ = self.iris.get_data() - self.assertIsInstance(xy, pd.DataFrame) - self.assertEqual(xy.shape, (150, 5)) + assert isinstance(xy, pd.DataFrame) + assert xy.shape == (150, 5) def test_lazy_loading_metadata(self): # Initial Setup did_cache_dir = openml.utils._create_cache_directory_for_id( - openml.datasets.functions.DATASETS_CACHE_DIR_NAME, 2 + openml.datasets.functions.DATASETS_CACHE_DIR_NAME, + 2, ) _compare_dataset = openml.datasets.get_dataset( - 2, download_data=False, download_features_meta_data=True, download_qualities=True + 2, + download_data=False, + download_features_meta_data=True, + download_qualities=True, ) change_time = os.stat(did_cache_dir).st_mtime # Test with cache _dataset = openml.datasets.get_dataset( - 2, download_data=False, download_features_meta_data=False, download_qualities=False + 2, + download_data=False, + download_features_meta_data=False, + download_qualities=False, ) - self.assertEqual(change_time, os.stat(did_cache_dir).st_mtime) - self.assertEqual(_dataset.features, _compare_dataset.features) - self.assertEqual(_dataset.qualities, _compare_dataset.qualities) + assert change_time == os.stat(did_cache_dir).st_mtime + assert 
_dataset.features == _compare_dataset.features + assert _dataset.qualities == _compare_dataset.qualities # -- Test without cache openml.utils._remove_cache_dir_for_id( - openml.datasets.functions.DATASETS_CACHE_DIR_NAME, did_cache_dir + openml.datasets.functions.DATASETS_CACHE_DIR_NAME, + did_cache_dir, ) _dataset = openml.datasets.get_dataset( - 2, download_data=False, download_features_meta_data=False, download_qualities=False + 2, + download_data=False, + download_features_meta_data=False, + download_qualities=False, ) - self.assertEqual(["description.xml"], os.listdir(did_cache_dir)) - self.assertNotEqual(change_time, os.stat(did_cache_dir).st_mtime) - self.assertEqual(_dataset.features, _compare_dataset.features) - self.assertEqual(_dataset.qualities, _compare_dataset.qualities) + assert ["description.xml"] == os.listdir(did_cache_dir) + assert change_time != os.stat(did_cache_dir).st_mtime + assert _dataset.features == _compare_dataset.features + assert _dataset.qualities == _compare_dataset.qualities class OpenMLDatasetTestOnTestServer(TestBase): def setUp(self): - super(OpenMLDatasetTestOnTestServer, self).setUp() + super().setUp() # longley, really small dataset self.dataset = openml.datasets.get_dataset(125, download_data=False) def test_tagging(self): - tag = "testing_tag_{}_{}".format(self.id(), time()) + # tags can be at most 64 alphanumeric (+ underscore) chars + unique_indicator = str(time()).replace(".", "") + tag = f"test_tag_OpenMLDatasetTestOnTestServer_{unique_indicator}" datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") - self.assertTrue(datasets.empty) + assert datasets.empty self.dataset.push_tag(tag) datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") - self.assertEqual(len(datasets), 1) - self.assertIn(125, datasets["did"]) + assert len(datasets) == 1 + assert 125 in datasets["did"] self.dataset.remove_tag(tag) datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") - 
self.assertTrue(datasets.empty) - - + assert datasets.empty + + def test_get_feature_with_ontology_data_id_11(self): + # test on car dataset, which has built-in ontology references + dataset = openml.datasets.get_dataset(11) + assert len(dataset.features) == 7 + assert len(dataset.features[1].ontologies) >= 2 + assert len(dataset.features[2].ontologies) >= 1 + assert len(dataset.features[3].ontologies) >= 1 + + def test_add_remove_ontology_to_dataset(self): + did = 1 + feature_index = 1 + ontology = 'https://www.openml.org/unittest/' + str(time()) + openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology) + openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology) + + def test_add_same_ontology_multiple_features(self): + did = 1 + ontology = 'https://www.openml.org/unittest/' + str(time()) + + for i in range(3): + openml.datasets.functions.data_feature_add_ontology(did, i, ontology) + + + def test_add_illegal_long_ontology(self): + did = 1 + ontology = 'http://www.google.com/' + ('a' * 257) + try: + openml.datasets.functions.data_feature_add_ontology(did, 1, ontology) + assert False + except openml.exceptions.OpenMLServerException as e: + assert e.code == 1105 + + def test_add_illegal_url_ontology(self): + did = 1 + ontology = 'not_a_url' + str(time()) + try: + openml.datasets.functions.data_feature_add_ontology(did, 1, ontology) + assert False + except openml.exceptions.OpenMLServerException as e: + assert e.code == 1106 + +@pytest.mark.production() class OpenMLDatasetTestSparse(TestBase): _multiprocess_can_split_ = True def setUp(self): - super(OpenMLDatasetTestSparse, self).setUp() + super().setUp() openml.config.server = self.production_server self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False) def test_get_sparse_dataset_array_with_target(self): X, y, _, attribute_names = self.sparse_dataset.get_data( - dataset_format="array", target="class" + dataset_format="array", + target="class", 
) - self.assertTrue(sparse.issparse(X)) - self.assertEqual(X.dtype, np.float32) - self.assertEqual(X.shape, (600, 20000)) + assert sparse.issparse(X) + assert X.dtype == np.float32 + assert X.shape == (600, 20000) - self.assertIsInstance(y, np.ndarray) - self.assertIn(y.dtype, [np.int32, np.int64]) - self.assertEqual(y.shape, (600,)) + assert isinstance(y, np.ndarray) + assert y.dtype in [np.int32, np.int64] + assert y.shape == (600,) - self.assertEqual(len(attribute_names), 20000) - self.assertNotIn("class", attribute_names) + assert len(attribute_names) == 20000 + assert "class" not in attribute_names def test_get_sparse_dataset_dataframe_with_target(self): X, y, _, attribute_names = self.sparse_dataset.get_data( - dataset_format="dataframe", target="class" + dataset_format="dataframe", + target="class", ) - self.assertIsInstance(X, pd.DataFrame) - self.assertIsInstance(X.dtypes[0], pd.SparseDtype) - self.assertEqual(X.shape, (600, 20000)) + assert isinstance(X, pd.DataFrame) + assert isinstance(X.dtypes[0], pd.SparseDtype) + assert X.shape == (600, 20000) - self.assertIsInstance(y, pd.Series) - self.assertIsInstance(y.dtypes, pd.SparseDtype) - self.assertEqual(y.shape, (600,)) + assert isinstance(y, pd.Series) + assert isinstance(y.dtypes, pd.SparseDtype) + assert y.shape == (600,) - self.assertEqual(len(attribute_names), 20000) - self.assertNotIn("class", attribute_names) + assert len(attribute_names) == 20000 + assert "class" not in attribute_names def test_get_sparse_dataset_array(self): rval, _, categorical, attribute_names = self.sparse_dataset.get_data(dataset_format="array") - self.assertTrue(sparse.issparse(rval)) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual((600, 20001), rval.shape) + assert sparse.issparse(rval) + assert rval.dtype == np.float32 + assert rval.shape == (600, 20001) - self.assertEqual(len(categorical), 20001) - self.assertTrue(all([isinstance(cat, bool) for cat in categorical])) + assert len(categorical) == 20001 + 
assert all(isinstance(cat, bool) for cat in categorical) - self.assertEqual(len(attribute_names), 20001) - self.assertTrue(all([isinstance(att, str) for att in attribute_names])) + assert len(attribute_names) == 20001 + assert all(isinstance(att, str) for att in attribute_names) def test_get_sparse_dataset_dataframe(self): rval, *_ = self.sparse_dataset.get_data() - self.assertIsInstance(rval, pd.DataFrame) + assert isinstance(rval, pd.DataFrame) np.testing.assert_array_equal( - [pd.SparseDtype(np.float32, fill_value=0.0)] * len(rval.dtypes), rval.dtypes + [pd.SparseDtype(np.float32, fill_value=0.0)] * len(rval.dtypes), + rval.dtypes, ) - self.assertEqual((600, 20001), rval.shape) + assert rval.shape == (600, 20001) def test_get_sparse_dataset_with_rowid(self): self.sparse_dataset.row_id_attribute = ["V256"] rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", include_row_id=True + dataset_format="array", + include_row_id=True, ) - self.assertTrue(sparse.issparse(rval)) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual(rval.shape, (600, 20001)) - self.assertEqual(len(categorical), 20001) + assert sparse.issparse(rval) + assert rval.dtype == np.float32 + assert rval.shape == (600, 20001) + assert len(categorical) == 20001 rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", include_row_id=False + dataset_format="array", + include_row_id=False, ) - self.assertTrue(sparse.issparse(rval)) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual(rval.shape, (600, 20000)) - self.assertEqual(len(categorical), 20000) + assert sparse.issparse(rval) + assert rval.dtype == np.float32 + assert rval.shape == (600, 20000) + assert len(categorical) == 20000 def test_get_sparse_dataset_with_ignore_attributes(self): self.sparse_dataset.ignore_attribute = ["V256"] rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", include_ignore_attribute=True + dataset_format="array", + 
include_ignore_attribute=True, ) - self.assertTrue(sparse.issparse(rval)) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual(rval.shape, (600, 20001)) + assert sparse.issparse(rval) + assert rval.dtype == np.float32 + assert rval.shape == (600, 20001) - self.assertEqual(len(categorical), 20001) + assert len(categorical) == 20001 rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", include_ignore_attribute=False + dataset_format="array", + include_ignore_attribute=False, ) - self.assertTrue(sparse.issparse(rval)) - self.assertEqual(rval.dtype, np.float32) - self.assertEqual(rval.shape, (600, 20000)) - self.assertEqual(len(categorical), 20000) + assert sparse.issparse(rval) + assert rval.dtype == np.float32 + assert rval.shape == (600, 20000) + assert len(categorical) == 20000 def test_get_sparse_dataset_rowid_and_ignore_and_target(self): # TODO: re-add row_id and ignore attributes @@ -419,24 +485,24 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): include_row_id=False, include_ignore_attribute=False, ) - self.assertTrue(sparse.issparse(X)) - self.assertEqual(X.dtype, np.float32) - self.assertIn(y.dtype, [np.int32, np.int64]) - self.assertEqual(X.shape, (600, 19998)) + assert sparse.issparse(X) + assert X.dtype == np.float32 + assert y.dtype in [np.int32, np.int64] + assert X.shape == (600, 19998) - self.assertEqual(len(categorical), 19998) + assert len(categorical) == 19998 self.assertListEqual(categorical, [False] * 19998) - self.assertEqual(y.shape, (600,)) + assert y.shape == (600,) def test_get_sparse_categorical_data_id_395(self): dataset = openml.datasets.get_dataset(395, download_data=True) feature = dataset.features[3758] - self.assertTrue(isinstance(dataset, OpenMLDataset)) - self.assertTrue(isinstance(feature, OpenMLDataFeature)) - self.assertEqual(dataset.name, "re1.wc") - self.assertEqual(feature.name, "CLASS_LABEL") - self.assertEqual(feature.data_type, "nominal") - 
self.assertEqual(len(feature.nominal_values), 25) + assert isinstance(dataset, OpenMLDataset) + assert isinstance(feature, OpenMLDataFeature) + assert dataset.name == "re1.wc" + assert feature.name == "CLASS_LABEL" + assert feature.data_type == "nominal" + assert len(feature.nominal_values) == 25 class OpenMLDatasetFunctionTest(TestBase): @@ -445,51 +511,65 @@ class OpenMLDatasetFunctionTest(TestBase): def test__read_features(self, filename_mock, pickle_mock): """Test we read the features from the xml if no cache pickle is available. - This test also does some simple checks to verify that the features are read correctly""" + This test also does some simple checks to verify that the features are read correctly + """ filename_mock.return_value = os.path.join(self.workdir, "features.xml.pkl") pickle_mock.load.side_effect = FileNotFoundError features = openml.datasets.dataset._read_features( os.path.join( - self.static_cache_dir, "org", "openml", "test", "datasets", "2", "features.xml" - ) + self.static_cache_dir, + "org", + "openml", + "test", + "datasets", + "2", + "features.xml", + ), ) - self.assertIsInstance(features, dict) - self.assertEqual(len(features), 39) - self.assertIsInstance(features[0], OpenMLDataFeature) - self.assertEqual(features[0].name, "family") - self.assertEqual(len(features[0].nominal_values), 9) + assert isinstance(features, dict) + assert len(features) == 39 + assert isinstance(features[0], OpenMLDataFeature) + assert features[0].name == "family" + assert len(features[0].nominal_values) == 9 # pickle.load is never called because the features pickle file didn't exist - self.assertEqual(pickle_mock.load.call_count, 0) - self.assertEqual(pickle_mock.dump.call_count, 1) + assert pickle_mock.load.call_count == 0 + assert pickle_mock.dump.call_count == 1 @unittest.mock.patch("openml.datasets.dataset.pickle") @unittest.mock.patch("openml.datasets.dataset._get_qualities_pickle_file") def test__read_qualities(self, filename_mock, pickle_mock): """Test 
we read the qualities from the xml if no cache pickle is available. - This test also does some minor checks to ensure that the qualities are read correctly.""" + This test also does some minor checks to ensure that the qualities are read correctly. + """ filename_mock.return_value = os.path.join(self.workdir, "qualities.xml.pkl") pickle_mock.load.side_effect = FileNotFoundError qualities = openml.datasets.dataset._read_qualities( os.path.join( - self.static_cache_dir, "org", "openml", "test", "datasets", "2", "qualities.xml" - ) + self.static_cache_dir, + "org", + "openml", + "test", + "datasets", + "2", + "qualities.xml", + ), ) - self.assertIsInstance(qualities, dict) - self.assertEqual(len(qualities), 106) + assert isinstance(qualities, dict) + assert len(qualities) == 106 # pickle.load is never called because the qualities pickle file didn't exist - self.assertEqual(pickle_mock.load.call_count, 0) - self.assertEqual(pickle_mock.dump.call_count, 1) + assert pickle_mock.load.call_count == 0 + assert pickle_mock.dump.call_count == 1 def test__check_qualities(self): qualities = [{"oml:name": "a", "oml:value": "0.5"}] qualities = openml.datasets.dataset._check_qualities(qualities) - self.assertEqual(qualities["a"], 0.5) + assert qualities["a"] == 0.5 qualities = [{"oml:name": "a", "oml:value": "null"}] qualities = openml.datasets.dataset._check_qualities(qualities) - self.assertNotEqual(qualities["a"], qualities["a"]) + assert qualities["a"] != qualities["a"] qualities = [{"oml:name": "a", "oml:value": None}] qualities = openml.datasets.dataset._check_qualities(qualities) - self.assertNotEqual(qualities["a"], qualities["a"]) + assert qualities["a"] != qualities["a"] diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index fe04f7d96..f3d269dc1 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1,18 +1,18 @@ # License: BSD 3-Clause +from __future__ 
import annotations import os -import pathlib +from pathlib import Path import random +import shutil +import time from itertools import product from unittest import mock -import shutil import arff -import time - -import pytest import numpy as np import pandas as pd +import pytest import requests import scipy.sparse from oslo_concurrency import lockutils @@ -20,41 +20,41 @@ import openml from openml import OpenMLDataset from openml._api_calls import _download_minio_file -from openml.exceptions import ( - OpenMLHashException, - OpenMLPrivateDatasetError, - OpenMLServerException, - OpenMLNotAuthorizedError, -) -from openml.testing import TestBase, create_request_response -from openml.utils import _tag_entity, _create_cache_directory_for_id +from openml.datasets import edit_dataset, fork_dataset from openml.datasets.functions import ( - create_dataset, - attributes_arff_from_df, + DATASETS_CACHE_DIR_NAME, _get_dataset_arff, _get_dataset_description, _get_dataset_features_file, + _get_dataset_parquet, _get_dataset_qualities_file, _get_online_dataset_arff, _get_online_dataset_format, - DATASETS_CACHE_DIR_NAME, - _get_dataset_parquet, _topic_add_dataset, _topic_delete_dataset, + attributes_arff_from_df, + create_dataset, +) +from openml.exceptions import ( + OpenMLHashException, + OpenMLNotAuthorizedError, + OpenMLPrivateDatasetError, + OpenMLServerException, ) -from openml.datasets import fork_dataset, edit_dataset from openml.tasks import TaskType, create_task +from openml.testing import TestBase, create_request_response +from openml.utils import _create_cache_directory_for_id, _tag_entity class TestOpenMLDataset(TestBase): _multiprocess_can_split_ = True def setUp(self): - super(TestOpenMLDataset, self).setUp() + super().setUp() def tearDown(self): self._remove_pickle_files() - super(TestOpenMLDataset, self).tearDown() + super().tearDown() def _remove_pickle_files(self): self.lock_path = os.path.join(openml.config.get_cache_directory(), "locks") @@ -64,7 +64,10 @@ def 
_remove_pickle_files(self): lock_path=self.lock_path, ): pickle_path = os.path.join( - openml.config.get_cache_directory(), "datasets", did, "dataset.pkl.py3" + openml.config.get_cache_directory(), + "datasets", + did, + "dataset.pkl.py3", ) try: os.remove(pickle_path) @@ -90,13 +93,13 @@ def _get_empty_param_for_dataset(self): } def _check_dataset(self, dataset): - self.assertEqual(type(dataset), dict) - self.assertGreaterEqual(len(dataset), 2) - self.assertIn("did", dataset) - self.assertIsInstance(dataset["did"], int) - self.assertIn("status", dataset) - self.assertIsInstance(dataset["status"], str) - self.assertIn(dataset["status"], ["in_preparation", "active", "deactivated"]) + assert type(dataset) == dict + assert len(dataset) >= 2 + assert "did" in dataset + assert isinstance(dataset["did"], int) + assert "status" in dataset + assert isinstance(dataset["status"], str) + assert dataset["status"] in ["in_preparation", "active", "deactivated"] def _check_datasets(self, datasets): for did in datasets: @@ -105,29 +108,31 @@ def _check_datasets(self, datasets): def test_tag_untag_dataset(self): tag = "test_tag_%d" % random.randint(1, 1000000) all_tags = _tag_entity("data", 1, tag) - self.assertTrue(tag in all_tags) + assert tag in all_tags all_tags = _tag_entity("data", 1, tag, untag=True) - self.assertTrue(tag not in all_tags) + assert tag not in all_tags def test_list_datasets_output_format(self): datasets = openml.datasets.list_datasets(output_format="dataframe") - self.assertIsInstance(datasets, pd.DataFrame) - self.assertGreaterEqual(len(datasets), 100) + assert isinstance(datasets, pd.DataFrame) + assert len(datasets) >= 100 def test_list_datasets_paginate(self): size = 10 max = 100 for i in range(0, max, size): datasets = openml.datasets.list_datasets(offset=i, size=size) - self.assertEqual(size, len(datasets)) + assert size == len(datasets) self._check_datasets(datasets) def test_list_datasets_empty(self): datasets = openml.datasets.list_datasets( - 
tag="NoOneWouldUseThisTagAnyway", output_format="dataframe" + tag="NoOneWouldUseThisTagAnyway", + output_format="dataframe", ) - self.assertTrue(datasets.empty) + assert datasets.empty + @pytest.mark.production() def test_check_datasets_active(self): # Have to test on live because there is no deactivated dataset on the test server. openml.config.server = self.production_server @@ -135,9 +140,9 @@ def test_check_datasets_active(self): [2, 17, 79], raise_error_if_not_exist=False, ) - self.assertTrue(active[2]) - self.assertFalse(active[17]) - self.assertIsNone(active.get(79)) + assert active[2] + assert not active[17] + assert active.get(79) is None self.assertRaisesRegex( ValueError, r"Could not find dataset\(s\) 79 in OpenML dataset list.", @@ -146,6 +151,24 @@ def test_check_datasets_active(self): ) openml.config.server = self.test_server + def test_illegal_character_tag(self): + dataset = openml.datasets.get_dataset(1) + tag = "illegal_tag&" + try: + dataset.push_tag(tag) + raise AssertionError() + except openml.exceptions.OpenMLServerException as e: + assert e.code == 477 + + def test_illegal_length_tag(self): + dataset = openml.datasets.get_dataset(1) + tag = "a" * 65 + try: + dataset.push_tag(tag) + raise AssertionError() + except openml.exceptions.OpenMLServerException as e: + assert e.code == 477 + def _datasets_retrieved_successfully(self, dids, metadata_only=True): """Checks that all files for the given dids have been downloaded. @@ -156,25 +179,19 @@ def _datasets_retrieved_successfully(self, dids, metadata_only=True): - absence of data arff if metadata_only, else it must be present too. 
""" for did in dids: - self.assertTrue( - os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "description.xml" - ) + assert os.path.exists( + os.path.join( + openml.config.get_cache_directory(), "datasets", str(did), "description.xml" ) ) - self.assertTrue( - os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "qualities.xml" - ) + assert os.path.exists( + os.path.join( + openml.config.get_cache_directory(), "datasets", str(did), "qualities.xml" ) ) - self.assertTrue( - os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "features.xml" - ) + assert os.path.exists( + os.path.join( + openml.config.get_cache_directory(), "datasets", str(did), "features.xml" ) ) @@ -182,28 +199,35 @@ def _datasets_retrieved_successfully(self, dids, metadata_only=True): data_assert( os.path.exists( os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "dataset.arff" - ) - ) + openml.config.get_cache_directory(), + "datasets", + str(did), + "dataset.arff", + ), + ), ) + @pytest.mark.production() def test__name_to_id_with_deactivated(self): """Check that an activated dataset is returned if an earlier deactivated one exists.""" openml.config.server = self.production_server # /d/1 was deactivated - self.assertEqual(openml.datasets.functions._name_to_id("anneal"), 2) + assert openml.datasets.functions._name_to_id("anneal") == 2 openml.config.server = self.test_server + @pytest.mark.production() def test__name_to_id_with_multiple_active(self): """With multiple active datasets, retrieve the least recent active.""" openml.config.server = self.production_server - self.assertEqual(openml.datasets.functions._name_to_id("iris"), 61) + assert openml.datasets.functions._name_to_id("iris") == 61 + @pytest.mark.production() def test__name_to_id_with_version(self): """With multiple active datasets, retrieve the least recent active.""" openml.config.server = 
self.production_server - self.assertEqual(openml.datasets.functions._name_to_id("iris", version=3), 969) + assert openml.datasets.functions._name_to_id("iris", version=3) == 969 + @pytest.mark.production() def test__name_to_id_with_multiple_active_error(self): """With multiple active datasets, retrieve the least recent active.""" openml.config.server = self.production_server @@ -238,40 +262,41 @@ def test_get_datasets_by_name(self): # did 1 and 2 on the test server: dids = ["anneal", "kr-vs-kp"] datasets = openml.datasets.get_datasets(dids, download_data=False) - self.assertEqual(len(datasets), 2) + assert len(datasets) == 2 self._datasets_retrieved_successfully([1, 2]) def test_get_datasets_by_mixed(self): # did 1 and 2 on the test server: dids = ["anneal", 2] datasets = openml.datasets.get_datasets(dids, download_data=False) - self.assertEqual(len(datasets), 2) + assert len(datasets) == 2 self._datasets_retrieved_successfully([1, 2]) def test_get_datasets(self): dids = [1, 2] datasets = openml.datasets.get_datasets(dids) - self.assertEqual(len(datasets), 2) + assert len(datasets) == 2 self._datasets_retrieved_successfully([1, 2], metadata_only=False) def test_get_datasets_lazy(self): dids = [1, 2] datasets = openml.datasets.get_datasets(dids, download_data=False) - self.assertEqual(len(datasets), 2) + assert len(datasets) == 2 self._datasets_retrieved_successfully([1, 2], metadata_only=True) datasets[0].get_data() datasets[1].get_data() self._datasets_retrieved_successfully([1, 2], metadata_only=False) + @pytest.mark.production() def test_get_dataset_by_name(self): dataset = openml.datasets.get_dataset("anneal") - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.dataset_id, 1) + assert type(dataset) == OpenMLDataset + assert dataset.dataset_id == 1 self._datasets_retrieved_successfully([1], metadata_only=False) - self.assertGreater(len(dataset.features), 1) - self.assertGreater(len(dataset.qualities), 4) + assert len(dataset.features) > 1 
+ assert len(dataset.qualities) > 4 # Issue324 Properly handle private datasets when trying to access them openml.config.server = self.production_server @@ -288,33 +313,35 @@ def test_get_dataset_download_all_files(self): def test_get_dataset_uint8_dtype(self): dataset = openml.datasets.get_dataset(1) - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.name, "anneal") + assert type(dataset) == OpenMLDataset + assert dataset.name == "anneal" df, _, _, _ = dataset.get_data() - self.assertEqual(df["carbon"].dtype, "uint8") + assert df["carbon"].dtype == "uint8" + @pytest.mark.production() def test_get_dataset(self): # This is the only non-lazy load to ensure default behaviour works. dataset = openml.datasets.get_dataset(1) - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.name, "anneal") + assert type(dataset) == OpenMLDataset + assert dataset.name == "anneal" self._datasets_retrieved_successfully([1], metadata_only=False) - self.assertGreater(len(dataset.features), 1) - self.assertGreater(len(dataset.qualities), 4) + assert len(dataset.features) > 1 + assert len(dataset.qualities) > 4 # Issue324 Properly handle private datasets when trying to access them openml.config.server = self.production_server self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45) + @pytest.mark.production() def test_get_dataset_lazy(self): dataset = openml.datasets.get_dataset(1, download_data=False) - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.name, "anneal") + assert type(dataset) == OpenMLDataset + assert dataset.name == "anneal" self._datasets_retrieved_successfully([1], metadata_only=True) - self.assertGreater(len(dataset.features), 1) - self.assertGreater(len(dataset.qualities), 4) + assert len(dataset.features) > 1 + assert len(dataset.qualities) > 4 dataset.get_data() self._datasets_retrieved_successfully([1], metadata_only=False) @@ -329,12 +356,8 @@ def 
test_get_dataset_lazy_all_functions(self): # We only tests functions as general integrity is tested by test_get_dataset_lazy def ensure_absence_of_real_data(): - self.assertFalse( - os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", "1", "dataset.arff" - ) - ) + assert not os.path.exists( + os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff") ) tag = "test_lazy_tag_%d" % random.randint(1, 1000000) @@ -349,36 +372,36 @@ def ensure_absence_of_real_data(): correct = [0, 1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 35, 36, 37, 38] # fmt: on - self.assertEqual(nominal_indices, correct) + assert nominal_indices == correct ensure_absence_of_real_data() classes = dataset.retrieve_class_labels() - self.assertEqual(classes, ["1", "2", "3", "4", "5", "U"]) + assert classes == ["1", "2", "3", "4", "5", "U"] ensure_absence_of_real_data() def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102, download_data=False) X, *_ = dataset.get_data(dataset_format="array") - self.assertIsInstance(X, scipy.sparse.csr_matrix) + assert isinstance(X, scipy.sparse.csr_matrix) def test_download_rowid(self): # Smoke test which checks that the dataset has the row-id set correctly did = 44 dataset = openml.datasets.get_dataset(did, download_data=False) - self.assertEqual(dataset.row_id_attribute, "Counter") + assert dataset.row_id_attribute == "Counter" def test__get_dataset_description(self): description = _get_dataset_description(self.workdir, 2) - self.assertIsInstance(description, dict) + assert isinstance(description, dict) description_xml_path = os.path.join(self.workdir, "description.xml") - self.assertTrue(os.path.exists(description_xml_path)) + assert os.path.exists(description_xml_path) def test__getarff_path_dataset_arff(self): openml.config.set_root_cache_directory(self.static_cache_dir) description = _get_dataset_description(self.workdir, 2) 
arff_path = _get_dataset_arff(description, cache_directory=self.workdir) - self.assertIsInstance(arff_path, str) - self.assertTrue(os.path.exists(arff_path)) + assert isinstance(arff_path, Path) + assert arff_path.exists() def test__download_minio_file_object_does_not_exist(self): self.assertRaisesRegex( @@ -396,10 +419,9 @@ def test__download_minio_file_to_directory(self): destination=self.workdir, exists_ok=True, ) - self.assertTrue( - os.path.isfile(os.path.join(self.workdir, "dataset_20.pq")), - "_download_minio_file can save to a folder by copying the object name", - ) + assert os.path.isfile( + os.path.join(self.workdir, "dataset_20.pq") + ), "_download_minio_file can save to a folder by copying the object name" def test__download_minio_file_to_path(self): file_destination = os.path.join(self.workdir, "custom.pq") @@ -408,13 +430,12 @@ def test__download_minio_file_to_path(self): destination=file_destination, exists_ok=True, ) - self.assertTrue( - os.path.isfile(file_destination), - "_download_minio_file can save to a folder by copying the object name", - ) + assert os.path.isfile( + file_destination + ), "_download_minio_file can save to a folder by copying the object name" def test__download_minio_file_raises_FileExists_if_destination_in_use(self): - file_destination = pathlib.Path(self.workdir, "custom.pq") + file_destination = Path(self.workdir, "custom.pq") file_destination.touch() self.assertRaises( @@ -426,47 +447,46 @@ def test__download_minio_file_raises_FileExists_if_destination_in_use(self): ) def test__download_minio_file_works_with_bucket_subdirectory(self): - file_destination = pathlib.Path(self.workdir, "custom.pq") + file_destination = Path(self.workdir, "custom.pq") _download_minio_file( source="http://openml1.win.tue.nl/dataset61/dataset_61.pq", destination=file_destination, exists_ok=True, ) - self.assertTrue( - os.path.isfile(file_destination), - "_download_minio_file can download from subdirectories", - ) + assert os.path.isfile( + 
file_destination + ), "_download_minio_file can download from subdirectories" def test__get_dataset_parquet_not_cached(self): description = { - "oml:minio_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq", + "oml:parquet_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq", "oml:id": "20", } path = _get_dataset_parquet(description, cache_directory=self.workdir) - self.assertIsInstance(path, str, "_get_dataset_parquet returns a path") - self.assertTrue(os.path.isfile(path), "_get_dataset_parquet returns path to real file") + assert isinstance(path, Path), "_get_dataset_parquet returns a path" + assert path.is_file(), "_get_dataset_parquet returns path to real file" @mock.patch("openml._api_calls._download_minio_file") def test__get_dataset_parquet_is_cached(self, patch): openml.config.set_root_cache_directory(self.static_cache_dir) patch.side_effect = RuntimeError( - "_download_minio_file should not be called when loading from cache" + "_download_parquet_url should not be called when loading from cache", ) description = { - "oml:minio_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq", + "oml:parquet_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq", "oml:id": "30", } path = _get_dataset_parquet(description, cache_directory=None) - self.assertIsInstance(path, str, "_get_dataset_parquet returns a path") - self.assertTrue(os.path.isfile(path), "_get_dataset_parquet returns path to real file") + assert isinstance(path, Path), "_get_dataset_parquet returns a path" + assert path.is_file(), "_get_dataset_parquet returns path to real file" def test__get_dataset_parquet_file_does_not_exist(self): description = { - "oml:minio_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq", + "oml:parquet_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq", "oml:id": "20", } path = _get_dataset_parquet(description, cache_directory=self.workdir) - self.assertIsNone(path, "_get_dataset_parquet returns None if no file is found") + assert path 
is None, "_get_dataset_parquet returns None if no file is found" def test__getarff_md5_issue(self): description = { @@ -489,26 +509,28 @@ def test__getarff_md5_issue(self): def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) - self.assertIsInstance(features_file, str) - features_xml_path = os.path.join(self.workdir, "features.xml") - self.assertTrue(os.path.exists(features_xml_path)) + assert isinstance(features_file, Path) + features_xml_path = self.workdir / "features.xml" + assert features_xml_path.exists() def test__get_dataset_qualities(self): qualities = _get_dataset_qualities_file(self.workdir, 2) - self.assertIsInstance(qualities, str) - qualities_xml_path = os.path.join(self.workdir, "qualities.xml") - self.assertTrue(os.path.exists(qualities_xml_path)) + assert isinstance(qualities, Path) + qualities_xml_path = self.workdir / "qualities.xml" + assert qualities_xml_path.exists() def test__get_dataset_skip_download(self): dataset = openml.datasets.get_dataset( - 2, download_qualities=False, download_features_meta_data=False + 2, + download_qualities=False, + download_features_meta_data=False, ) # Internal representation without lazy loading - self.assertIsNone(dataset._qualities) - self.assertIsNone(dataset._features) + assert dataset._qualities is None + assert dataset._features is None # External representation with lazy loading - self.assertIsNotNone(dataset.qualities) - self.assertIsNotNone(dataset.features) + assert dataset.qualities is not None + assert dataset.features is not None def test_get_dataset_force_refresh_cache(self): did_cache_dir = _create_cache_directory_for_id( @@ -520,11 +542,11 @@ def test_get_dataset_force_refresh_cache(self): # Test default openml.datasets.get_dataset(2) - self.assertEqual(change_time, os.stat(did_cache_dir).st_mtime) + assert change_time == os.stat(did_cache_dir).st_mtime # Test refresh openml.datasets.get_dataset(2, force_refresh_cache=True) - 
self.assertNotEqual(change_time, os.stat(did_cache_dir).st_mtime) + assert change_time != os.stat(did_cache_dir).st_mtime # Final clean up openml.utils._remove_cache_dir_for_id( @@ -545,7 +567,7 @@ def test_get_dataset_force_refresh_cache_clean_start(self): # Test clean start openml.datasets.get_dataset(2, force_refresh_cache=True) - self.assertTrue(os.path.exists(did_cache_dir)) + assert os.path.exists(did_cache_dir) # Final clean up openml.utils._remove_cache_dir_for_id( @@ -559,12 +581,12 @@ def test_deletion_of_cache_dir(self): DATASETS_CACHE_DIR_NAME, 1, ) - self.assertTrue(os.path.exists(did_cache_dir)) + assert os.path.exists(did_cache_dir) openml.utils._remove_cache_dir_for_id( DATASETS_CACHE_DIR_NAME, did_cache_dir, ) - self.assertFalse(os.path.exists(did_cache_dir)) + assert not os.path.exists(did_cache_dir) # Use _get_dataset_arff to load the description, trigger an exception in the # test target and have a slightly higher coverage @@ -573,13 +595,16 @@ def test_deletion_of_cache_dir_faulty_download(self, patch): patch.side_effect = Exception("Boom!") self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1) datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets") - self.assertEqual(len(os.listdir(datasets_cache_dir)), 0) + assert len(os.listdir(datasets_cache_dir)) == 0 def test_publish_dataset(self): # lazy loading not possible as we need the arff-file. 
openml.datasets.get_dataset(3) file_path = os.path.join( - openml.config.get_cache_directory(), "datasets", "3", "dataset.arff" + openml.config.get_cache_directory(), + "datasets", + "3", + "dataset.arff", ) dataset = OpenMLDataset( "anneal", @@ -593,18 +618,25 @@ def test_publish_dataset(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.dataset_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.dataset_id) + "collected from {}: {}".format(__file__.split("/")[-1], dataset.dataset_id), ) - self.assertIsInstance(dataset.dataset_id, int) + assert isinstance(dataset.dataset_id, int) def test__retrieve_class_labels(self): openml.config.set_root_cache_directory(self.static_cache_dir) labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels() - self.assertEqual(labels, ["1", "2", "3", "4", "5", "U"]) + assert labels == ["1", "2", "3", "4", "5", "U"] + labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels( - target_name="product-type" + target_name="product-type", ) - self.assertEqual(labels, ["C", "H", "G"]) + assert labels == ["C", "H", "G"] + + # Test workaround for string-typed class labels + custom_ds = openml.datasets.get_dataset(2, download_data=False) + custom_ds.features[31].data_type = "string" + labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name) + assert labels == ["COIL", "SHEET"] def test_upload_dataset_with_url(self): dataset = OpenMLDataset( @@ -617,21 +649,23 @@ def test_upload_dataset_with_url(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.dataset_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.dataset_id) + "collected from {}: {}".format(__file__.split("/")[-1], dataset.dataset_id), ) - self.assertIsInstance(dataset.dataset_id, int) + assert isinstance(dataset.dataset_id, int) def _assert_status_of_dataset(self, *, did: int, status: 
str): """Asserts there is exactly one dataset with id `did` and its current status is `status`""" # need to use listing fn, as this is immune to cache result = openml.datasets.list_datasets( - data_id=[did], status="all", output_format="dataframe" + data_id=[did], + status="all", + output_format="dataframe", ) result = result.to_dict(orient="index") # I think we should drop the test that one result is returned, # the server should never return multiple results? - self.assertEqual(len(result), 1) - self.assertEqual(result[did]["status"], status) + assert len(result) == 1 + assert result[did]["status"] == status @pytest.mark.flaky() def test_data_status(self): @@ -660,7 +694,7 @@ def test_data_status(self): openml.datasets.status_update(did, "active") self._assert_status_of_dataset(did=did, status="active") - with self.assertRaises(ValueError): + with pytest.raises(ValueError): openml.datasets.status_update(did, "in_preparation") self._assert_status_of_dataset(did=did, status="active") @@ -672,32 +706,29 @@ def test_attributes_arff_from_df(self): ) df["category"] = df["category"].astype("category") attributes = attributes_arff_from_df(df) - self.assertEqual( - attributes, - [ - ("integer", "INTEGER"), - ("floating", "REAL"), - ("string", "STRING"), - ("category", ["A", "B"]), - ("boolean", ["True", "False"]), - ], - ) + assert attributes == [ + ("integer", "INTEGER"), + ("floating", "REAL"), + ("string", "STRING"), + ("category", ["A", "B"]), + ("boolean", ["True", "False"]), + ] # DataFrame with Sparse columns case df = pd.DataFrame( { "integer": pd.arrays.SparseArray([1, 2, 0], fill_value=0), "floating": pd.arrays.SparseArray([1.0, 2.0, 0], fill_value=0.0), - } + }, ) df["integer"] = df["integer"].astype(np.int64) attributes = attributes_arff_from_df(df) - self.assertEqual(attributes, [("integer", "INTEGER"), ("floating", "REAL")]) + assert attributes == [("integer", "INTEGER"), ("floating", "REAL")] def test_attributes_arff_from_df_numeric_column(self): # Test 
column names are automatically converted to str if needed (#819) df = pd.DataFrame({0: [1, 2, 3], 0.5: [4, 5, 6], "target": [0, 1, 1]}) attributes = attributes_arff_from_df(df) - self.assertEqual(attributes, [("0", "INTEGER"), ("0.5", "INTEGER"), ("target", "INTEGER")]) + assert attributes == [("0", "INTEGER"), ("0.5", "INTEGER"), ("target", "INTEGER")] def test_attributes_arff_from_df_mixed_dtype_categories(self): # liac-arff imposed categorical attributes to be of sting dtype. We @@ -719,8 +750,7 @@ def test_attributes_arff_from_df_unknown_dtype(self): for arr, dt in zip(data, dtype): df = pd.DataFrame(arr) err_msg = ( - "The dtype '{}' of the column '0' is not currently " - "supported by liac-arff".format(dt) + f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff" ) with pytest.raises(ValueError, match=err_msg): attributes_arff_from_df(df) @@ -728,7 +758,7 @@ def test_attributes_arff_from_df_unknown_dtype(self): def test_create_dataset_numpy(self): data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T - attributes = [("col_{}".format(i), "REAL") for i in range(data.shape[1])] + attributes = [(f"col_{i}", "REAL") for i in range(data.shape[1])] dataset = create_dataset( name="%s-NumPy_testing_dataset" % self._get_sentinel(), @@ -738,7 +768,7 @@ def test_create_dataset_numpy(self): collection_date="01-01-2018", language="English", licence="MIT", - default_target_attribute="col_{}".format(data.shape[1] - 1), + default_target_attribute=f"col_{data.shape[1] - 1}", row_id_attribute=None, ignore_attribute=None, citation="None", @@ -753,12 +783,10 @@ def test_create_dataset_numpy(self): TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) - self.assertEqual( - _get_online_dataset_arff(dataset.id), - dataset._dataset, - "Uploaded arff does not match original one", - ) - self.assertEqual(_get_online_dataset_format(dataset.id), "arff", "Wrong 
format for dataset") + assert ( + _get_online_dataset_arff(dataset.id) == dataset._dataset + ), "Uploaded arff does not match original one" + assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" def test_create_dataset_list(self): data = [ @@ -809,17 +837,15 @@ def test_create_dataset_list(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) - self.assertEqual( - _get_online_dataset_arff(dataset.id), - dataset._dataset, - "Uploaded ARFF does not match original one", - ) - self.assertEqual(_get_online_dataset_format(dataset.id), "arff", "Wrong format for dataset") + assert ( + _get_online_dataset_arff(dataset.id) == dataset._dataset + ), "Uploaded ARFF does not match original one" + assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix sparse_data = scipy.sparse.coo_matrix( - ([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])) + ([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])), ) column_names = [ @@ -848,16 +874,14 @@ def test_create_dataset_sparse(self): xor_dataset.publish() TestBase._mark_entity_for_removal("data", xor_dataset.id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], xor_dataset.id) - ) - self.assertEqual( - _get_online_dataset_arff(xor_dataset.id), - xor_dataset._dataset, - "Uploaded ARFF does not match original one", - ) - self.assertEqual( - _get_online_dataset_format(xor_dataset.id), "sparse_arff", "Wrong format for dataset" + "collected from {}: {}".format(__file__.split("/")[-1], xor_dataset.id), ) + assert ( + _get_online_dataset_arff(xor_dataset.id) == xor_dataset._dataset + ), "Uploaded ARFF does not match original one" + assert ( + _get_online_dataset_format(xor_dataset.id) == "sparse_arff" + ), 
"Wrong format for dataset" # test the list of dicts sparse representation sparse_data = [{0: 0.0}, {1: 1.0, 2: 1.0}, {0: 1.0, 2: 1.0}, {0: 1.0, 1: 1.0}] @@ -882,16 +906,14 @@ def test_create_dataset_sparse(self): xor_dataset.publish() TestBase._mark_entity_for_removal("data", xor_dataset.id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], xor_dataset.id) - ) - self.assertEqual( - _get_online_dataset_arff(xor_dataset.id), - xor_dataset._dataset, - "Uploaded ARFF does not match original one", - ) - self.assertEqual( - _get_online_dataset_format(xor_dataset.id), "sparse_arff", "Wrong format for dataset" + "collected from {}: {}".format(__file__.split("/")[-1], xor_dataset.id), ) + assert ( + _get_online_dataset_arff(xor_dataset.id) == xor_dataset._dataset + ), "Uploaded ARFF does not match original one" + assert ( + _get_online_dataset_format(xor_dataset.id) == "sparse_arff" + ), "Wrong format for dataset" def test_create_invalid_dataset(self): data = [ @@ -928,15 +950,11 @@ def test_get_online_dataset_arff(self): # the same as the arff from _get_arff function d_format = (dataset.format).lower() - self.assertEqual( - dataset._get_arff(d_format), - decoder.decode( - _get_online_dataset_arff(dataset_id), - encode_nominal=True, - return_type=arff.DENSE if d_format == "arff" else arff.COO, - ), - "ARFF files are not equal", - ) + assert dataset._get_arff(d_format) == decoder.decode( + _get_online_dataset_arff(dataset_id), + encode_nominal=True, + return_type=arff.DENSE if d_format == "arff" else arff.COO, + ), "ARFF files are not equal" def test_topic_api_error(self): # Check server exception when non-admin accessses apis @@ -961,11 +979,9 @@ def test_get_online_dataset_format(self): dataset_id = 77 dataset = openml.datasets.get_dataset(dataset_id, download_data=False) - self.assertEqual( - (dataset.format).lower(), - _get_online_dataset_format(dataset_id), - "The format of the ARFF files is different", - ) + assert dataset.format.lower() 
== _get_online_dataset_format( + dataset_id + ), "The format of the ARFF files is different" def test_create_dataset_pandas(self): data = [ @@ -1012,15 +1028,13 @@ def test_create_dataset_pandas(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) - self.assertEqual( - _get_online_dataset_arff(dataset.id), - dataset._dataset, - "Uploaded ARFF does not match original one", - ) + assert ( + _get_online_dataset_arff(dataset.id) == dataset._dataset + ), "Uploaded ARFF does not match original one" # Check that DataFrame with Sparse columns are supported properly sparse_data = scipy.sparse.coo_matrix( - ([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])) + ([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])), ) column_names = ["input1", "input2", "y"] df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names) @@ -1047,14 +1061,10 @@ def test_create_dataset_pandas(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) - self.assertEqual( - _get_online_dataset_arff(dataset.id), - dataset._dataset, - "Uploaded ARFF does not match original one", - ) - self.assertEqual( - _get_online_dataset_format(dataset.id), "sparse_arff", "Wrong format for dataset" - ) + assert ( + _get_online_dataset_arff(dataset.id) == dataset._dataset + ), "Uploaded ARFF does not match original one" + assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset" # Check that we can overwrite the attributes data = [["a"], ["b"], ["c"], ["d"], ["e"]] @@ -1084,10 +1094,8 @@ def test_create_dataset_pandas(self): TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) downloaded_data = 
_get_online_dataset_arff(dataset.id) - self.assertEqual( - downloaded_data, dataset._dataset, "Uploaded ARFF does not match original one" - ) - self.assertTrue("@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data) + assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one" + assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data def test_ignore_attributes_dataset(self): data = [ @@ -1136,7 +1144,7 @@ def test_ignore_attributes_dataset(self): original_data_url=original_data_url, paper_url=paper_url, ) - self.assertEqual(dataset.ignore_attribute, ["outlook"]) + assert dataset.ignore_attribute == ["outlook"] # pass a list to ignore_attribute ignore_attribute = ["outlook", "windy"] @@ -1158,7 +1166,7 @@ def test_ignore_attributes_dataset(self): original_data_url=original_data_url, paper_url=paper_url, ) - self.assertEqual(dataset.ignore_attribute, ignore_attribute) + assert dataset.ignore_attribute == ignore_attribute # raise an error if unknown type err_msg = "Wrong data type for ignore_attribute. Should be list." 
@@ -1173,7 +1181,7 @@ def test_ignore_attributes_dataset(self): licence=licence, default_target_attribute=default_target_attribute, row_id_attribute=None, - ignore_attribute=tuple(["outlook", "windy"]), + ignore_attribute=("outlook", "windy"), citation=citation, attributes="auto", data=df, @@ -1235,10 +1243,10 @@ def test_publish_fetch_ignore_attribute(self): TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) # test if publish was successful - self.assertIsInstance(dataset.id, int) + assert isinstance(dataset.id, int) downloaded_dataset = self._wait_for_dataset_being_processed(dataset.id) - self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute) + assert downloaded_dataset.ignore_attribute == ignore_attribute def _wait_for_dataset_being_processed(self, dataset_id): downloaded_dataset = None @@ -1255,12 +1263,12 @@ def _wait_for_dataset_being_processed(self, dataset_id): # returned code 273: Dataset not processed yet # returned code 362: No qualities found TestBase.logger.error( - "Failed to fetch dataset:{} with '{}'.".format(dataset_id, str(e)) + f"Failed to fetch dataset:{dataset_id} with '{e!s}'.", ) time.sleep(10) continue if downloaded_dataset is None: - raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset_id)) + raise ValueError(f"TIMEOUT: Failed to fetch uploaded dataset - {dataset_id}") return downloaded_dataset def test_create_dataset_row_id_attribute_error(self): @@ -1321,7 +1329,8 @@ def test_create_dataset_row_id_attribute_inference(self): df_index_name = [None, "index_name"] expected_row_id = [None, "index_name", "integer", "integer"] for output_row_id, (row_id, index_name) in zip( - expected_row_id, product(row_id_attr, df_index_name) + expected_row_id, + product(row_id_attr, df_index_name), ): df.index.name = index_name dataset = openml.datasets.functions.create_dataset( @@ -1342,18 +1351,18 @@ def 
test_create_dataset_row_id_attribute_inference(self): original_data_url=original_data_url, paper_url=paper_url, ) - self.assertEqual(dataset.row_id_attribute, output_row_id) + assert dataset.row_id_attribute == output_row_id dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) + "collected from {}: {}".format(__file__.split("/")[-1], dataset.id), ) arff_dataset = arff.loads(_get_online_dataset_arff(dataset.id)) arff_data = np.array(arff_dataset["data"], dtype=object) # if we set the name of the index then the index will be added to # the data expected_shape = (5, 3) if index_name is None else (5, 4) - self.assertEqual(arff_data.shape, expected_shape) + assert arff_data.shape == expected_shape def test_create_dataset_attributes_auto_without_df(self): # attributes cannot be inferred without passing a dataframe @@ -1365,7 +1374,7 @@ def test_create_dataset_attributes_auto_without_df(self): collection_date = "01-01-2018" language = "English" licence = "MIT" - default_target_attribute = "col_{}".format(data.shape[1] - 1) + default_target_attribute = f"col_{data.shape[1] - 1}" citation = "None" original_data_url = "http://openml.github.io/openml-python" paper_url = "http://openml.github.io/openml-python" @@ -1392,23 +1401,23 @@ def test_create_dataset_attributes_auto_without_df(self): def test_list_qualities(self): qualities = openml.datasets.list_qualities() - self.assertEqual(isinstance(qualities, list), True) - self.assertEqual(all([isinstance(q, str) for q in qualities]), True) + assert isinstance(qualities, list) is True + assert all(isinstance(q, str) for q in qualities) is True def test_get_dataset_cache_format_pickle(self): dataset = openml.datasets.get_dataset(1) dataset.get_data() - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.name, "anneal") - self.assertGreater(len(dataset.features), 1) - 
self.assertGreater(len(dataset.qualities), 4) + assert type(dataset) == OpenMLDataset + assert dataset.name == "anneal" + assert len(dataset.features) > 1 + assert len(dataset.qualities) > 4 X, y, categorical, attribute_names = dataset.get_data() - self.assertIsInstance(X, pd.DataFrame) - self.assertEqual(X.shape, (898, 39)) - self.assertEqual(len(categorical), X.shape[1]) - self.assertEqual(len(attribute_names), X.shape[1]) + assert isinstance(X, pd.DataFrame) + assert X.shape == (898, 39) + assert len(categorical) == X.shape[1] + assert len(attribute_names) == X.shape[1] def test_get_dataset_cache_format_feather(self): # This test crashed due to using the parquet file by default, which is downloaded @@ -1416,7 +1425,7 @@ def test_get_dataset_cache_format_feather(self): # The parquet file on minio with ID 128 is not the iris dataset from the test server. dataset = openml.datasets.get_dataset(128, cache_format="feather") # Workaround - dataset._minio_url = None + dataset._parquet_url = None dataset.parquet_file = None dataset.get_data() @@ -1426,21 +1435,21 @@ def test_get_dataset_cache_format_feather(self): feather_file = os.path.join(cache_dir_for_id, "dataset.feather") pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3") data = pd.read_feather(feather_file) - self.assertTrue(os.path.isfile(feather_file), msg="Feather file is missing") - self.assertTrue(os.path.isfile(pickle_file), msg="Attributes pickle file is missing") - self.assertEqual(data.shape, (150, 5)) + assert os.path.isfile(feather_file), "Feather file is missing" + assert os.path.isfile(pickle_file), "Attributes pickle file is missing" + assert data.shape == (150, 5) # Check if get_data is able to retrieve feather data - self.assertEqual(type(dataset), OpenMLDataset) - self.assertEqual(dataset.name, "iris") - self.assertGreater(len(dataset.features), 1) - self.assertGreater(len(dataset.qualities), 4) + assert type(dataset) == OpenMLDataset + assert dataset.name == "iris" 
+ assert len(dataset.features) > 1 + assert len(dataset.qualities) > 4 X, y, categorical, attribute_names = dataset.get_data() - self.assertIsInstance(X, pd.DataFrame) - self.assertEqual(X.shape, (150, 5)) - self.assertEqual(len(categorical), X.shape[1]) - self.assertEqual(len(attribute_names), X.shape[1]) + assert isinstance(X, pd.DataFrame) + assert X.shape == (150, 5) + assert len(categorical) == X.shape[1] + assert len(attribute_names) == X.shape[1] def test_data_edit_non_critical_field(self): # Case 1 @@ -1459,9 +1468,9 @@ def test_data_edit_non_critical_field(self): citation="The use of multiple measurements in taxonomic problems", language="English", ) - self.assertEqual(did, result) + assert did == result edited_dataset = openml.datasets.get_dataset(did) - self.assertEqual(edited_dataset.description, desc) + assert edited_dataset.description == desc def test_data_edit_critical_field(self): # Case 2 @@ -1470,15 +1479,15 @@ def test_data_edit_critical_field(self): did = fork_dataset(1) self._wait_for_dataset_being_processed(did) result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil") - self.assertEqual(did, result) + assert did == result n_tries = 10 # we need to wait for the edit to be reflected on the server for i in range(n_tries): edited_dataset = openml.datasets.get_dataset(did) try: - self.assertEqual(edited_dataset.default_target_attribute, "shape", edited_dataset) - self.assertEqual(edited_dataset.ignore_attribute, ["oil"], edited_dataset) + assert edited_dataset.default_target_attribute == "shape", edited_dataset + assert edited_dataset.ignore_attribute == ["oil"], edited_dataset break except AssertionError as e: if i == n_tries - 1: @@ -1486,7 +1495,7 @@ def test_data_edit_critical_field(self): time.sleep(10) # Delete the cache dir to get the newer version of the dataset shutil.rmtree( - os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)) + os.path.join(self.workdir, "org", "openml", "test", 
"datasets", str(did)), ) def test_data_edit_errors(self): @@ -1547,7 +1556,7 @@ def test_data_edit_errors(self): def test_data_fork(self): did = 1 result = fork_dataset(did) - self.assertNotEqual(did, result) + assert did != result # Check server exception when unknown dataset is provided self.assertRaisesRegex( OpenMLServerException, @@ -1556,15 +1565,17 @@ def test_data_fork(self): data_id=999999, ) + @pytest.mark.production() def test_get_dataset_parquet(self): # Parquet functionality is disabled on the test server # There is no parquet-copy of the test server yet. openml.config.server = self.production_server dataset = openml.datasets.get_dataset(61) - self.assertIsNotNone(dataset._minio_url) - self.assertIsNotNone(dataset.parquet_file) - self.assertTrue(os.path.isfile(dataset.parquet_file)) + assert dataset._parquet_url is not None + assert dataset.parquet_file is not None + assert os.path.isfile(dataset.parquet_file) + @pytest.mark.production() def test_list_datasets_with_high_size_parameter(self): # Testing on prod since concurrent deletion of uploded datasets make the test fail openml.config.server = self.production_server @@ -1574,11 +1585,11 @@ def test_list_datasets_with_high_size_parameter(self): # Reverting to test server openml.config.server = self.test_server - self.assertEqual(len(datasets_a), len(datasets_b)) + assert len(datasets_a) == len(datasets_b) @pytest.mark.parametrize( - "default_target_attribute,row_id_attribute,ignore_attribute", + ("default_target_attribute", "row_id_attribute", "ignore_attribute"), [ ("wrong", None, None), (None, "wrong", None), @@ -1590,7 +1601,9 @@ def test_list_datasets_with_high_size_parameter(self): ], ) def test_invalid_attribute_validations( - default_target_attribute, row_id_attribute, ignore_attribute + default_target_attribute, + row_id_attribute, + ignore_attribute, ): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -1637,7 +1650,7 @@ def test_invalid_attribute_validations( @pytest.mark.parametrize( - 
"default_target_attribute,row_id_attribute,ignore_attribute", + ("default_target_attribute", "row_id_attribute", "ignore_attribute"), [ ("outlook", None, None), (None, "outlook", None), @@ -1735,7 +1748,7 @@ def test_delete_dataset(self): ) dataset.publish() _dataset_id = dataset.id - self.assertTrue(openml.datasets.delete_dataset(_dataset_id)) + assert openml.datasets.delete_dataset(_dataset_id) @mock.patch.object(requests.Session, "delete") @@ -1745,7 +1758,8 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_ke test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml" ) mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -1768,7 +1782,8 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml" ) mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -1791,7 +1806,8 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key) test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml" ) mock_delete.return_value = create_request_response( - status_code=200, content_filepath=content_file + status_code=200, + content_filepath=content_file, ) success = openml.datasets.delete_dataset(40000) @@ -1811,7 +1827,8 @@ def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key) test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml" ) mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -1841,7 +1858,7 @@ def test_list_datasets(all_datasets: 
pd.DataFrame): # We can only perform a smoke test here because we test on dynamic # data from the internet... # 1087 as the number of datasets on openml.org - assert 100 <= len(all_datasets) + assert len(all_datasets) >= 100 _assert_datasets_have_id_and_valid_status(all_datasets) @@ -1853,13 +1870,14 @@ def test_list_datasets_by_tag(all_datasets: pd.DataFrame): def test_list_datasets_by_size(): datasets = openml.datasets.list_datasets(size=5, output_format="dataframe") - assert 5 == len(datasets) + assert len(datasets) == 5 _assert_datasets_have_id_and_valid_status(datasets) def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): small_datasets = openml.datasets.list_datasets( - number_instances="5..100", output_format="dataframe" + number_instances="5..100", + output_format="dataframe", ) assert 0 < len(small_datasets) <= len(all_datasets) _assert_datasets_have_id_and_valid_status(small_datasets) @@ -1867,7 +1885,8 @@ def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): wide_datasets = openml.datasets.list_datasets( - number_features="50..100", output_format="dataframe" + number_features="50..100", + output_format="dataframe", ) assert 8 <= len(wide_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(wide_datasets) @@ -1875,7 +1894,8 @@ def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): five_class_datasets = openml.datasets.list_datasets( - number_classes="5", output_format="dataframe" + number_classes="5", + output_format="dataframe", ) assert 3 <= len(five_class_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(five_class_datasets) @@ -1883,7 +1903,8 @@ def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): na_datasets = 
openml.datasets.list_datasets( - number_missing_values="5..100", output_format="dataframe" + number_missing_values="5..100", + output_format="dataframe", ) assert 5 <= len(na_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(na_datasets) diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index 70f36ce19..7af01384f 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -1,4 +1,6 @@ # License: BSD 3-Clause +from __future__ import annotations + import pytest import openml @@ -12,19 +14,26 @@ class TestEvaluationFunctions(TestBase): def _check_list_evaluation_setups(self, **kwargs): evals_setups = openml.evaluations.list_evaluations_setups( - "predictive_accuracy", **kwargs, sort_order="desc", output_format="dataframe" + "predictive_accuracy", + **kwargs, + sort_order="desc", + output_format="dataframe", ) evals = openml.evaluations.list_evaluations( - "predictive_accuracy", **kwargs, sort_order="desc", output_format="dataframe" + "predictive_accuracy", + **kwargs, + sort_order="desc", + output_format="dataframe", ) # Check if list is non-empty - self.assertGreater(len(evals_setups), 0) + assert len(evals_setups) > 0 # Check if length is accurate - self.assertEqual(len(evals_setups), len(evals)) + assert len(evals_setups) == len(evals) # Check if output from sort is sorted in the right order self.assertSequenceEqual( - sorted(evals_setups["value"].tolist(), reverse=True), evals_setups["value"].tolist() + sorted(evals_setups["value"].tolist(), reverse=True), + evals_setups["value"].tolist(), ) # Check if output and order of list_evaluations is preserved @@ -34,7 +43,7 @@ def _check_list_evaluation_setups(self, **kwargs): evals_setups = evals_setups.head(1) # Check if the hyper-parameter column is as accurate and flow_id - for index, row in evals_setups.iterrows(): + for _index, row in evals_setups.iterrows(): 
params = openml.runs.get_run(row["run_id"]).parameter_settings list1 = [param["oml:value"] for param in params] list2 = list(row["parameters"].values()) @@ -42,99 +51,119 @@ def _check_list_evaluation_setups(self, **kwargs): self.assertSequenceEqual(sorted(list1), sorted(list2)) return evals_setups + @pytest.mark.production() def test_evaluation_list_filter_task(self): openml.config.server = self.production_server task_id = 7312 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=110, tasks=[task_id] + "predictive_accuracy", + size=110, + tasks=[task_id], ) - self.assertGreater(len(evaluations), 100) - for run_id in evaluations.keys(): - self.assertEqual(evaluations[run_id].task_id, task_id) + assert len(evaluations) > 100 + for run_id in evaluations: + assert evaluations[run_id].task_id == task_id # default behaviour of this method: return aggregated results (not # per fold) - self.assertIsNotNone(evaluations[run_id].value) - self.assertIsNone(evaluations[run_id].values) + assert evaluations[run_id].value is not None + assert evaluations[run_id].values is None + @pytest.mark.production() def test_evaluation_list_filter_uploader_ID_16(self): openml.config.server = self.production_server uploader_id = 16 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=60, uploaders=[uploader_id], output_format="dataframe" + "predictive_accuracy", + size=60, + uploaders=[uploader_id], + output_format="dataframe", ) - self.assertEqual(evaluations["uploader"].unique(), [uploader_id]) + assert evaluations["uploader"].unique() == [uploader_id] - self.assertGreater(len(evaluations), 50) + assert len(evaluations) > 50 + @pytest.mark.production() def test_evaluation_list_filter_uploader_ID_10(self): openml.config.server = self.production_server setup_id = 10 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=60, setups=[setup_id] + "predictive_accuracy", + size=60, + setups=[setup_id], ) - 
self.assertGreater(len(evaluations), 50) - for run_id in evaluations.keys(): - self.assertEqual(evaluations[run_id].setup_id, setup_id) + assert len(evaluations) > 50 + for run_id in evaluations: + assert evaluations[run_id].setup_id == setup_id # default behaviour of this method: return aggregated results (not # per fold) - self.assertIsNotNone(evaluations[run_id].value) - self.assertIsNone(evaluations[run_id].values) + assert evaluations[run_id].value is not None + assert evaluations[run_id].values is None + @pytest.mark.production() def test_evaluation_list_filter_flow(self): openml.config.server = self.production_server flow_id = 100 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=10, flows=[flow_id] + "predictive_accuracy", + size=10, + flows=[flow_id], ) - self.assertGreater(len(evaluations), 2) - for run_id in evaluations.keys(): - self.assertEqual(evaluations[run_id].flow_id, flow_id) + assert len(evaluations) > 2 + for run_id in evaluations: + assert evaluations[run_id].flow_id == flow_id # default behaviour of this method: return aggregated results (not # per fold) - self.assertIsNotNone(evaluations[run_id].value) - self.assertIsNone(evaluations[run_id].values) + assert evaluations[run_id].value is not None + assert evaluations[run_id].values is None + @pytest.mark.production() def test_evaluation_list_filter_run(self): openml.config.server = self.production_server run_id = 12 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=2, runs=[run_id] + "predictive_accuracy", + size=2, + runs=[run_id], ) - self.assertEqual(len(evaluations), 1) - for run_id in evaluations.keys(): - self.assertEqual(evaluations[run_id].run_id, run_id) + assert len(evaluations) == 1 + for run_id in evaluations: + assert evaluations[run_id].run_id == run_id # default behaviour of this method: return aggregated results (not # per fold) - self.assertIsNotNone(evaluations[run_id].value) - 
self.assertIsNone(evaluations[run_id].values) + assert evaluations[run_id].value is not None + assert evaluations[run_id].values is None + @pytest.mark.production() def test_evaluation_list_limit(self): openml.config.server = self.production_server evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", size=100, offset=100 + "predictive_accuracy", + size=100, + offset=100, ) - self.assertEqual(len(evaluations), 100) + assert len(evaluations) == 100 def test_list_evaluations_empty(self): evaluations = openml.evaluations.list_evaluations("unexisting_measure") if len(evaluations) > 0: raise ValueError("UnitTest Outdated, got somehow results") - self.assertIsInstance(evaluations, dict) + assert isinstance(evaluations, dict) + @pytest.mark.production() def test_evaluation_list_per_fold(self): openml.config.server = self.production_server size = 1000 @@ -152,10 +181,10 @@ def test_evaluation_list_per_fold(self): per_fold=True, ) - self.assertEqual(len(evaluations), size) - for run_id in evaluations.keys(): - self.assertIsNone(evaluations[run_id].value) - self.assertIsNotNone(evaluations[run_id].values) + assert len(evaluations) == size + for run_id in evaluations: + assert evaluations[run_id].value is None + assert evaluations[run_id].values is not None # potentially we could also test array values, but these might be # added in the future @@ -168,39 +197,48 @@ def test_evaluation_list_per_fold(self): uploaders=uploader_ids, per_fold=False, ) - for run_id in evaluations.keys(): - self.assertIsNotNone(evaluations[run_id].value) - self.assertIsNone(evaluations[run_id].values) + for run_id in evaluations: + assert evaluations[run_id].value is not None + assert evaluations[run_id].values is None + @pytest.mark.production() def test_evaluation_list_sort(self): openml.config.server = self.production_server size = 10 task_id = 6 # Get all evaluations of the task unsorted_eval = openml.evaluations.list_evaluations( - "predictive_accuracy", size=None, 
offset=0, tasks=[task_id] + "predictive_accuracy", + size=None, + offset=0, + tasks=[task_id], ) # Get top 10 evaluations of the same task sorted_eval = openml.evaluations.list_evaluations( - "predictive_accuracy", size=size, offset=0, tasks=[task_id], sort_order="desc" + "predictive_accuracy", + size=size, + offset=0, + tasks=[task_id], + sort_order="desc", ) - self.assertEqual(len(sorted_eval), size) - self.assertGreater(len(unsorted_eval), 0) + assert len(sorted_eval) == size + assert len(unsorted_eval) > 0 sorted_output = [evaluation.value for evaluation in sorted_eval.values()] unsorted_output = [evaluation.value for evaluation in unsorted_eval.values()] # Check if output from sort is sorted in the right order - self.assertTrue(sorted(sorted_output, reverse=True) == sorted_output) + assert sorted(sorted_output, reverse=True) == sorted_output # Compare manual sorting against sorted output test_output = sorted(unsorted_output, reverse=True) - self.assertTrue(test_output[:size] == sorted_output) + assert test_output[:size] == sorted_output def test_list_evaluation_measures(self): measures = openml.evaluations.list_evaluation_measures() - self.assertEqual(isinstance(measures, list), True) - self.assertEqual(all([isinstance(s, str) for s in measures]), True) + assert isinstance(measures, list) is True + assert all(isinstance(s, str) for s in measures) is True + @pytest.mark.production() def test_list_evaluations_setups_filter_flow(self): openml.config.server = self.production_server flow_id = [405] @@ -217,8 +255,9 @@ def test_list_evaluations_setups_filter_flow(self): ) columns = list(evals_cols.columns) keys = list(evals["parameters"].values[0].keys()) - self.assertTrue(all(elem in columns for elem in keys)) + assert all(elem in columns for elem in keys) + @pytest.mark.production() def test_list_evaluations_setups_filter_task(self): openml.config.server = self.production_server task_id = [6] diff --git a/tests/test_evaluations/test_evaluations_example.py 
b/tests/test_evaluations/test_evaluations_example.py index 5715b570a..bf5b03f3f 100644 --- a/tests/test_evaluations/test_evaluations_example.py +++ b/tests/test_evaluations/test_evaluations_example.py @@ -1,4 +1,5 @@ # License: BSD 3-Clause +from __future__ import annotations import unittest @@ -8,9 +9,10 @@ def test_example_python_paper(self): # Example script which will appear in the upcoming OpenML-Python paper # This test ensures that the example will keep running! - import openml - import numpy as np import matplotlib.pyplot as plt + import numpy as np + + import openml df = openml.evaluations.list_evaluations_setups( "predictive_accuracy", diff --git a/tests/test_extensions/test_functions.py b/tests/test_extensions/test_functions.py index 36bb06061..bc7937c88 100644 --- a/tests/test_extensions/test_functions.py +++ b/tests/test_extensions/test_functions.py @@ -1,10 +1,12 @@ # License: BSD 3-Clause +from __future__ import annotations import inspect -import openml.testing +import pytest -from openml.extensions import get_extension_by_model, get_extension_by_flow, register_extension +import openml.testing +from openml.extensions import get_extension_by_flow, get_extension_by_model, register_extension class DummyFlow: @@ -61,31 +63,29 @@ def setUp(self): _unregister() def test_get_extension_by_flow(self): - self.assertIsNone(get_extension_by_flow(DummyFlow())) - with self.assertRaisesRegex(ValueError, "No extension registered which can handle flow:"): + assert get_extension_by_flow(DummyFlow()) is None + with pytest.raises(ValueError, match="No extension registered which can handle flow:"): get_extension_by_flow(DummyFlow(), raise_if_no_extension=True) register_extension(DummyExtension1) - self.assertIsInstance(get_extension_by_flow(DummyFlow()), DummyExtension1) + assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1) register_extension(DummyExtension2) - self.assertIsInstance(get_extension_by_flow(DummyFlow()), DummyExtension1) + assert 
isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1) register_extension(DummyExtension1) - with self.assertRaisesRegex( - ValueError, - "Multiple extensions registered which can handle flow:", + with pytest.raises( + ValueError, match="Multiple extensions registered which can handle flow:" ): get_extension_by_flow(DummyFlow()) def test_get_extension_by_model(self): - self.assertIsNone(get_extension_by_model(DummyModel())) - with self.assertRaisesRegex(ValueError, "No extension registered which can handle model:"): + assert get_extension_by_model(DummyModel()) is None + with pytest.raises(ValueError, match="No extension registered which can handle model:"): get_extension_by_model(DummyModel(), raise_if_no_extension=True) register_extension(DummyExtension1) - self.assertIsInstance(get_extension_by_model(DummyModel()), DummyExtension1) + assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1) register_extension(DummyExtension2) - self.assertIsInstance(get_extension_by_model(DummyModel()), DummyExtension1) + assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1) register_extension(DummyExtension1) - with self.assertRaisesRegex( - ValueError, - "Multiple extensions registered which can handle model:", + with pytest.raises( + ValueError, match="Multiple extensions registered which can handle model:" ): get_extension_by_model(DummyModel()) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 2b07796ed..4c7b0d60e 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -1,17 +1,17 @@ # License: BSD 3-Clause +from __future__ import annotations import collections import json -import re import os +import re import sys -from typing import Any import unittest -from distutils.version import LooseVersion +import warnings 
from collections import OrderedDict +from distutils.version import LooseVersion +from typing import Any from unittest import mock -import warnings -from packaging import version import numpy as np import pandas as pd @@ -19,6 +19,7 @@ import scipy.optimize import scipy.stats import sklearn.base +import sklearn.cluster import sklearn.datasets import sklearn.decomposition import sklearn.dummy @@ -32,19 +33,17 @@ import sklearn.pipeline import sklearn.preprocessing import sklearn.tree -import sklearn.cluster +from packaging import version from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler import openml -from openml.extensions.sklearn import SklearnExtension from openml.exceptions import PyOpenMLError +from openml.extensions.sklearn import SklearnExtension, cat, cont from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase, SimpleImputer, CustomImputer -from openml.extensions.sklearn import cat, cont - +from openml.testing import CustomImputer, SimpleImputer, TestBase this_directory = os.path.dirname(os.path.abspath(__file__)) sys.path.append(this_directory) @@ -115,7 +114,12 @@ def _get_expected_pipeline_description(self, model: Any) -> str: return expected_fixture def _serialization_test_helper( - self, model, X, y, subcomponent_parameters, dependencies_mock_call_count=(1, 2) + self, + model, + X, + y, + subcomponent_parameters, + dependencies_mock_call_count=(1, 2), ): # Regex pattern for memory addresses of style 0x7f8e0f31ecf8 pattern = re.compile("0x[0-9a-f]{12}") @@ -129,61 +133,60 @@ def _serialization_test_helper( new_model = self.extension.flow_to_model(serialization) # compares string representations of the dict, as it potentially # contains complex objects that can not be compared with == op - self.assertEqual( - re.sub(pattern, str(model.get_params()), ""), - re.sub(pattern, 
str(new_model.get_params()), ""), + assert re.sub(pattern, str(model.get_params()), "") == re.sub( + pattern, str(new_model.get_params()), "" ) - self.assertEqual(type(new_model), type(model)) - self.assertIsNot(new_model, model) + assert type(new_model) == type(model) + assert new_model is not model if X is not None: new_model.fit(self.X, self.y) - self.assertEqual(check_dependencies_mock.call_count, dependencies_mock_call_count[0]) + assert check_dependencies_mock.call_count == dependencies_mock_call_count[0] xml = serialization._to_dict() new_model2 = self.extension.flow_to_model(OpenMLFlow._from_dict(xml)) - self.assertEqual( - re.sub(pattern, str(model.get_params()), ""), - re.sub(pattern, str(new_model2.get_params()), ""), + assert re.sub(pattern, str(model.get_params()), "") == re.sub( + pattern, str(new_model2.get_params()), "" ) - self.assertEqual(type(new_model2), type(model)) - self.assertIsNot(new_model2, model) + assert type(new_model2) == type(model) + assert new_model2 is not model if X is not None: new_model2.fit(self.X, self.y) - self.assertEqual(check_dependencies_mock.call_count, dependencies_mock_call_count[1]) + assert check_dependencies_mock.call_count == dependencies_mock_call_count[1] if subcomponent_parameters: for nm in (new_model, new_model2): new_model_params = nm.get_params() model_params = model.get_params() for subcomponent_parameter in subcomponent_parameters: - self.assertEqual( - type(new_model_params[subcomponent_parameter]), - type(model_params[subcomponent_parameter]), + assert type(new_model_params[subcomponent_parameter]) == type( + model_params[subcomponent_parameter] ) - self.assertIsNot( - new_model_params[subcomponent_parameter], - model_params[subcomponent_parameter], + assert ( + new_model_params[subcomponent_parameter] + is not model_params[subcomponent_parameter] ) del new_model_params[subcomponent_parameter] del model_params[subcomponent_parameter] - self.assertEqual(new_model_params, model_params) + assert 
new_model_params == model_params return serialization, new_model - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_model(self): model = sklearn.tree.DecisionTreeClassifier( - criterion="entropy", max_features="auto", max_leaf_nodes=2000 + criterion="entropy", + max_features="auto", + max_leaf_nodes=2000, ) tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" - fixture_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) + fixture_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier" fixture_short_name = "sklearn.DecisionTreeClassifier" # str obtained from self.extension._get_sklearn_description(model) fixture_description = "A decision tree classifier." @@ -207,7 +210,7 @@ def test_serialize_model(self): ("presort", "false"), ("random_state", "null"), ("splitter", '"best"'), - ) + ), ) elif LooseVersion(sklearn.__version__) < "1.0": fixture_parameters = OrderedDict( @@ -225,7 +228,7 @@ def test_serialize_model(self): ("presort", presort_val), ("random_state", "null"), ("splitter", '"best"'), - ) + ), ) else: fixture_parameters = OrderedDict( @@ -242,7 +245,7 @@ def test_serialize_model(self): ("presort", presort_val), ("random_state", "null"), ("splitter", '"best"'), - ) + ), ) if LooseVersion(sklearn.__version__) >= "0.22": @@ -251,22 +254,26 @@ def test_serialize_model(self): if LooseVersion(sklearn.__version__) >= "0.24": del fixture_parameters["presort"] - structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []} + structure_fixture = {f"sklearn.tree.{tree_name}.DecisionTreeClassifier": []} serialization, _ = self._serialization_test_helper( - model, X=self.X, y=self.y, subcomponent_parameters=None + model, + X=self.X, + y=self.y, + subcomponent_parameters=None, ) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.class_name, fixture_name) - self.assertEqual(serialization.custom_name, 
fixture_short_name) - self.assertEqual(serialization.description, fixture_description) - self.assertEqual(serialization.parameters, fixture_parameters) - self.assertEqual(serialization.dependencies, version_fixture) + assert serialization.name == fixture_name + assert serialization.class_name == fixture_name + assert serialization.custom_name == fixture_short_name + assert serialization.description == fixture_description + assert serialization.parameters == fixture_parameters + assert serialization.dependencies == version_fixture self.assertDictEqual(structure, structure_fixture) - @pytest.mark.sklearn + @pytest.mark.sklearn() + @pytest.mark.production() def test_can_handle_flow(self): openml.config.server = self.production_server @@ -277,16 +284,16 @@ def test_can_handle_flow(self): openml.config.server = self.test_server - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_model_clustering(self): model = sklearn.cluster.KMeans() cluster_name = "k_means_" if LooseVersion(sklearn.__version__) < "0.22" else "_kmeans" - fixture_name = "sklearn.cluster.{}.KMeans".format(cluster_name) + fixture_name = f"sklearn.cluster.{cluster_name}.KMeans" fixture_short_name = "sklearn.KMeans" # str obtained from self.extension._get_sklearn_description(model) fixture_description = "K-Means clustering{}".format( - "" if LooseVersion(sklearn.__version__) < "0.22" else "." 
+ "" if LooseVersion(sklearn.__version__) < "0.22" else ".", ) version_fixture = self.extension._min_dependency_str(sklearn.__version__) @@ -308,7 +315,7 @@ def test_serialize_model_clustering(self): ("random_state", "null"), ("tol", "0.0001"), ("verbose", "0"), - ) + ), ) elif LooseVersion(sklearn.__version__) < "1.0": fixture_parameters = OrderedDict( @@ -324,7 +331,7 @@ def test_serialize_model_clustering(self): ("random_state", "null"), ("tol", "0.0001"), ("verbose", "0"), - ) + ), ) elif LooseVersion(sklearn.__version__) < "1.1": fixture_parameters = OrderedDict( @@ -338,7 +345,7 @@ def test_serialize_model_clustering(self): ("random_state", "null"), ("tol", "0.0001"), ("verbose", "0"), - ) + ), ) else: n_init = '"warn"' if LooseVersion(sklearn.__version__) >= "1.2" else "10" @@ -353,12 +360,15 @@ def test_serialize_model_clustering(self): ("random_state", "null"), ("tol", "0.0001"), ("verbose", "0"), - ) + ), ) - fixture_structure = {"sklearn.cluster.{}.KMeans".format(cluster_name): []} + fixture_structure = {f"sklearn.cluster.{cluster_name}.KMeans": []} serialization, _ = self._serialization_test_helper( - model, X=None, y=None, subcomponent_parameters=None + model, + X=None, + y=None, + subcomponent_parameters=None, ) structure = serialization.get_structure("name") @@ -370,21 +380,22 @@ def test_serialize_model_clustering(self): assert serialization.dependencies == version_fixture assert structure == fixture_structure - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_model_with_subcomponent(self): model = sklearn.ensemble.AdaBoostClassifier( - n_estimators=100, base_estimator=sklearn.tree.DecisionTreeClassifier() + n_estimators=100, + base_estimator=sklearn.tree.DecisionTreeClassifier(), ) weight_name = "{}weight_boosting".format( - "" if LooseVersion(sklearn.__version__) < "0.22" else "_" + "" if LooseVersion(sklearn.__version__) < "0.22" else "_", ) tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" 
fixture_name = ( - "sklearn.ensemble.{}.AdaBoostClassifier" - "(base_estimator=sklearn.tree.{}.DecisionTreeClassifier)".format(weight_name, tree_name) + f"sklearn.ensemble.{weight_name}.AdaBoostClassifier" + f"(base_estimator=sklearn.tree.{tree_name}.DecisionTreeClassifier)" ) - fixture_class_name = "sklearn.ensemble.{}.AdaBoostClassifier".format(weight_name) + fixture_class_name = f"sklearn.ensemble.{weight_name}.AdaBoostClassifier" fixture_short_name = "sklearn.AdaBoostClassifier" # str obtained from self.extension._get_sklearn_description(model) fixture_description = ( @@ -396,13 +407,13 @@ def test_serialize_model_with_subcomponent(self): " on difficult cases.\n\nThis class implements the algorithm known " "as AdaBoost-SAMME [2]." ) - fixture_subcomponent_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) - fixture_subcomponent_class_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) + fixture_subcomponent_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier" + fixture_subcomponent_class_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier" # str obtained from self.extension._get_sklearn_description(model.base_estimator) fixture_subcomponent_description = "A decision tree classifier." 
fixture_structure = { fixture_name: [], - "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): ["base_estimator"], + f"sklearn.tree.{tree_name}.DecisionTreeClassifier": ["base_estimator"], } serialization, _ = self._serialization_test_helper( @@ -414,24 +425,25 @@ def test_serialize_model_with_subcomponent(self): ) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.class_name, fixture_class_name) - self.assertEqual(serialization.custom_name, fixture_short_name) - self.assertEqual(serialization.description, fixture_description) - self.assertEqual(serialization.parameters["algorithm"], '"SAMME.R"') - self.assertIsInstance(serialization.parameters["base_estimator"], str) - self.assertEqual(serialization.parameters["learning_rate"], "1.0") - self.assertEqual(serialization.parameters["n_estimators"], "100") - self.assertEqual(serialization.components["base_estimator"].name, fixture_subcomponent_name) - self.assertEqual( - serialization.components["base_estimator"].class_name, fixture_subcomponent_class_name - ) - self.assertEqual( - serialization.components["base_estimator"].description, fixture_subcomponent_description + assert serialization.name == fixture_name + assert serialization.class_name == fixture_class_name + assert serialization.custom_name == fixture_short_name + assert serialization.description == fixture_description + assert serialization.parameters["algorithm"] == '"SAMME.R"' + assert isinstance(serialization.parameters["base_estimator"], str) + assert serialization.parameters["learning_rate"] == "1.0" + assert serialization.parameters["n_estimators"] == "100" + assert serialization.components["base_estimator"].name == fixture_subcomponent_name + assert ( + serialization.components["base_estimator"].class_name == fixture_subcomponent_class_name + ) + assert ( + serialization.components["base_estimator"].description + == fixture_subcomponent_description ) 
self.assertDictEqual(structure, fixture_structure) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_pipeline(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) dummy = sklearn.dummy.DummyClassifier(strategy="prior") @@ -440,14 +452,14 @@ def test_serialize_pipeline(self): scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" fixture_name = ( "sklearn.pipeline.Pipeline(" - "scaler=sklearn.preprocessing.{}.StandardScaler," - "dummy=sklearn.dummy.DummyClassifier)".format(scaler_name) + f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler," + "dummy=sklearn.dummy.DummyClassifier)" ) fixture_short_name = "sklearn.Pipeline(StandardScaler,DummyClassifier)" fixture_description = self._get_expected_pipeline_description(model) fixture_structure = { fixture_name: [], - "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["scaler"], + f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], "sklearn.dummy.DummyClassifier": ["dummy"], } @@ -460,9 +472,9 @@ def test_serialize_pipeline(self): ) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.custom_name, fixture_short_name) - self.assertEqual(serialization.description, fixture_description) + assert serialization.name == fixture_name + assert serialization.custom_name == fixture_short_name + assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) # Comparing the pipeline @@ -470,38 +482,35 @@ def test_serialize_pipeline(self): # as value # memory parameter has been added in 0.19, verbose in 0.21 if LooseVersion(sklearn.__version__) < "0.19": - self.assertEqual(len(serialization.parameters), 1) + assert len(serialization.parameters) == 1 elif LooseVersion(sklearn.__version__) < "0.21": - self.assertEqual(len(serialization.parameters), 2) + assert len(serialization.parameters) == 2 else: - 
self.assertEqual(len(serialization.parameters), 3) + assert len(serialization.parameters) == 3 # Hard to compare two representations of a dict due to possibly # different sorting. Making a json makes it easier - self.assertEqual( - json.loads(serialization.parameters["steps"]), - [ - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "scaler", "step_name": "scaler"}, - }, - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "dummy", "step_name": "dummy"}, - }, - ], - ) + assert json.loads(serialization.parameters["steps"]) == [ + { + "oml-python:serialized_object": "component_reference", + "value": {"key": "scaler", "step_name": "scaler"}, + }, + { + "oml-python:serialized_object": "component_reference", + "value": {"key": "dummy", "step_name": "dummy"}, + }, + ] # Checking the sub-component - self.assertEqual(len(serialization.components), 2) - self.assertIsInstance(serialization.components["scaler"], OpenMLFlow) - self.assertIsInstance(serialization.components["dummy"], OpenMLFlow) + assert len(serialization.components) == 2 + assert isinstance(serialization.components["scaler"], OpenMLFlow) + assert isinstance(serialization.components["dummy"], OpenMLFlow) - self.assertEqual([step[0] for step in new_model.steps], [step[0] for step in model.steps]) - self.assertIsNot(new_model.steps[0][1], model.steps[0][1]) - self.assertIsNot(new_model.steps[1][1], model.steps[1][1]) + assert [step[0] for step in new_model.steps] == [step[0] for step in model.steps] + assert new_model.steps[0][1] is not model.steps[0][1] + assert new_model.steps[1][1] is not model.steps[1][1] - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_pipeline_clustering(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) km = sklearn.cluster.KMeans() @@ -511,15 +520,15 @@ def test_serialize_pipeline_clustering(self): cluster_name = "k_means_" if LooseVersion(sklearn.__version__) < "0.22" else "_kmeans" fixture_name = 
( "sklearn.pipeline.Pipeline(" - "scaler=sklearn.preprocessing.{}.StandardScaler," - "clusterer=sklearn.cluster.{}.KMeans)".format(scaler_name, cluster_name) + f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler," + f"clusterer=sklearn.cluster.{cluster_name}.KMeans)" ) fixture_short_name = "sklearn.Pipeline(StandardScaler,KMeans)" fixture_description = self._get_expected_pipeline_description(model) fixture_structure = { fixture_name: [], - "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["scaler"], - "sklearn.cluster.{}.KMeans".format(cluster_name): ["clusterer"], + f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], + f"sklearn.cluster.{cluster_name}.KMeans": ["clusterer"], } serialization, new_model = self._serialization_test_helper( model, @@ -530,9 +539,9 @@ def test_serialize_pipeline_clustering(self): ) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.custom_name, fixture_short_name) - self.assertEqual(serialization.description, fixture_description) + assert serialization.name == fixture_name + assert serialization.custom_name == fixture_short_name + assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) # Comparing the pipeline @@ -540,37 +549,34 @@ def test_serialize_pipeline_clustering(self): # as value # memory parameter has been added in 0.19 if LooseVersion(sklearn.__version__) < "0.19": - self.assertEqual(len(serialization.parameters), 1) + assert len(serialization.parameters) == 1 elif LooseVersion(sklearn.__version__) < "0.21": - self.assertEqual(len(serialization.parameters), 2) + assert len(serialization.parameters) == 2 else: - self.assertEqual(len(serialization.parameters), 3) + assert len(serialization.parameters) == 3 # Hard to compare two representations of a dict due to possibly # different sorting. 
Making a json makes it easier - self.assertEqual( - json.loads(serialization.parameters["steps"]), - [ - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "scaler", "step_name": "scaler"}, - }, - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "clusterer", "step_name": "clusterer"}, - }, - ], - ) + assert json.loads(serialization.parameters["steps"]) == [ + { + "oml-python:serialized_object": "component_reference", + "value": {"key": "scaler", "step_name": "scaler"}, + }, + { + "oml-python:serialized_object": "component_reference", + "value": {"key": "clusterer", "step_name": "clusterer"}, + }, + ] # Checking the sub-component - self.assertEqual(len(serialization.components), 2) - self.assertIsInstance(serialization.components["scaler"], OpenMLFlow) - self.assertIsInstance(serialization.components["clusterer"], OpenMLFlow) + assert len(serialization.components) == 2 + assert isinstance(serialization.components["scaler"], OpenMLFlow) + assert isinstance(serialization.components["clusterer"], OpenMLFlow) - self.assertEqual([step[0] for step in new_model.steps], [step[0] for step in model.steps]) - self.assertIsNot(new_model.steps[0][1], model.steps[0][1]) - self.assertIsNot(new_model.steps[1][1], model.steps[1][1]) + assert [step[0] for step in new_model.steps] == [step[0] for step in model.steps] + assert new_model.steps[0][1] is not model.steps[0][1] + assert new_model.steps[1][1] is not model.steps[1][1] - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -595,8 +601,8 @@ def test_serialize_column_transformer(self): scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" fixture = ( "sklearn.compose._column_transformer.ColumnTransformer(" - "numeric=sklearn.preprocessing.{}.StandardScaler," - 
"nominal=sklearn.preprocessing._encoders.OneHotEncoder,drop=drop)".format(scaler_name) + f"numeric=sklearn.preprocessing.{scaler_name}.StandardScaler," + "nominal=sklearn.preprocessing._encoders.OneHotEncoder,drop=drop)" ) fixture_short_name = "sklearn.ColumnTransformer" @@ -617,19 +623,19 @@ def test_serialize_column_transformer(self): fixture_structure = { fixture: [], - "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["numeric"], + f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["numeric"], "sklearn.preprocessing._encoders.OneHotEncoder": ["nominal"], "drop": ["drop"], } serialization = self.extension.model_to_flow(model) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture) - self.assertEqual(serialization.custom_name, fixture_short_name) - self.assertEqual(serialization.description, fixture_description) + assert serialization.name == fixture + assert serialization.custom_name == fixture_short_name + assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -650,7 +656,7 @@ def test_serialize_column_transformer_pipeline(self): remainder="passthrough", ) model = sklearn.pipeline.Pipeline( - steps=[("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier())] + steps=[("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier())], ) scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" @@ -658,20 +664,20 @@ def test_serialize_column_transformer_pipeline(self): "sklearn.pipeline.Pipeline(" "transformer=sklearn.compose._column_transformer." 
"ColumnTransformer(" - "numeric=sklearn.preprocessing.{}.StandardScaler," + f"numeric=sklearn.preprocessing.{scaler_name}.StandardScaler," "nominal=sklearn.preprocessing._encoders.OneHotEncoder)," - "classifier=sklearn.tree.{}.DecisionTreeClassifier)".format(scaler_name, tree_name) + f"classifier=sklearn.tree.{tree_name}.DecisionTreeClassifier)" ) fixture_structure = { - "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): [ + f"sklearn.preprocessing.{scaler_name}.StandardScaler": [ "transformer", "numeric", ], "sklearn.preprocessing._encoders.OneHotEncoder": ["transformer", "nominal"], "sklearn.compose._column_transformer.ColumnTransformer(numeric=" - "sklearn.preprocessing.{}.StandardScaler,nominal=sklearn." - "preprocessing._encoders.OneHotEncoder)".format(scaler_name): ["transformer"], - "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): ["classifier"], + f"sklearn.preprocessing.{scaler_name}.StandardScaler,nominal=sklearn." + "preprocessing._encoders.OneHotEncoder)": ["transformer"], + f"sklearn.tree.{tree_name}.DecisionTreeClassifier": ["classifier"], fixture_name: [], } @@ -691,14 +697,15 @@ def test_serialize_column_transformer_pipeline(self): dependencies_mock_call_count=(5, 10), ) structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.description, fixture_description) + assert serialization.name == fixture_name + assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", reason="Pipeline processing behaviour updated" + LooseVersion(sklearn.__version__) < "0.20", + reason="Pipeline processing behaviour updated", ) def test_serialize_feature_union(self): ohe_params = {"sparse": False} @@ -721,33 +728,30 @@ def test_serialize_feature_union(self): scaler_name = "data" if LooseVersion(sklearn.__version__) < 
"0.22" else "_data" fixture_name = ( "sklearn.pipeline.FeatureUnion(" - "ohe=sklearn.preprocessing.{}.OneHotEncoder," - "scaler=sklearn.preprocessing.{}.StandardScaler)".format( - module_name_encoder, scaler_name - ) + f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," + f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler)" ) fixture_structure = { fixture_name: [], - "sklearn.preprocessing.{}." "OneHotEncoder".format(module_name_encoder): ["ohe"], - "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["scaler"], + f"sklearn.preprocessing.{module_name_encoder}." "OneHotEncoder": ["ohe"], + f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], } - self.assertEqual(serialization.name, fixture_name) + assert serialization.name == fixture_name self.assertDictEqual(structure, fixture_structure) - self.assertEqual(new_model.transformer_list[0][0], fu.transformer_list[0][0]) - self.assertEqual( - new_model.transformer_list[0][1].get_params(), fu.transformer_list[0][1].get_params() + assert new_model.transformer_list[0][0] == fu.transformer_list[0][0] + assert ( + new_model.transformer_list[0][1].get_params() == fu.transformer_list[0][1].get_params() ) - self.assertEqual(new_model.transformer_list[1][0], fu.transformer_list[1][0]) - self.assertEqual( - new_model.transformer_list[1][1].get_params(), fu.transformer_list[1][1].get_params() + assert new_model.transformer_list[1][0] == fu.transformer_list[1][0] + assert ( + new_model.transformer_list[1][1].get_params() == fu.transformer_list[1][1].get_params() ) - self.assertEqual( - [step[0] for step in new_model.transformer_list], - [step[0] for step in fu.transformer_list], - ) - self.assertIsNot(new_model.transformer_list[0][1], fu.transformer_list[0][1]) - self.assertIsNot(new_model.transformer_list[1][1], fu.transformer_list[1][1]) + assert [step[0] for step in new_model.transformer_list] == [ + step[0] for step in fu.transformer_list + ] + assert 
new_model.transformer_list[0][1] is not fu.transformer_list[0][1] + assert new_model.transformer_list[1][1] is not fu.transformer_list[1][1] fu.set_params(scaler="drop") serialization, new_model = self._serialization_test_helper( @@ -757,15 +761,14 @@ def test_serialize_feature_union(self): subcomponent_parameters=("ohe", "transformer_list"), dependencies_mock_call_count=(3, 6), ) - self.assertEqual( - serialization.name, - "sklearn.pipeline.FeatureUnion(" - "ohe=sklearn.preprocessing.{}.OneHotEncoder," - "scaler=drop)".format(module_name_encoder), + assert ( + serialization.name == "sklearn.pipeline.FeatureUnion(" + f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," + "scaler=drop)" ) - self.assertIs(new_model.transformer_list[1][1], "drop") + assert new_model.transformer_list[1][1] == "drop" - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_feature_union_switched_names(self): ohe_params = {"categories": "auto"} if LooseVersion(sklearn.__version__) >= "0.20" else {} ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params) @@ -791,30 +794,26 @@ def test_serialize_feature_union_switched_names(self): # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" - self.assertEqual( - fu1_serialization.name, - "sklearn.pipeline.FeatureUnion(" - "ohe=sklearn.preprocessing.{}.OneHotEncoder," - "scaler=sklearn.preprocessing.{}.StandardScaler)".format( - module_name_encoder, scaler_name - ), + assert ( + fu1_serialization.name == "sklearn.pipeline.FeatureUnion(" + f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," + f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler)" ) - self.assertEqual( - fu2_serialization.name, - "sklearn.pipeline.FeatureUnion(" - "scaler=sklearn.preprocessing.{}.OneHotEncoder," - 
"ohe=sklearn.preprocessing.{}.StandardScaler)".format(module_name_encoder, scaler_name), + assert ( + fu2_serialization.name == "sklearn.pipeline.FeatureUnion(" + f"scaler=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," + f"ohe=sklearn.preprocessing.{scaler_name}.StandardScaler)" ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_complex_flow(self): ohe = sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore") scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier() + base_estimator=sklearn.tree.DecisionTreeClassifier(), ) model = sklearn.pipeline.Pipeline( - steps=[("ohe", ohe), ("scaler", scaler), ("boosting", boosting)] + steps=[("ohe", ohe), ("scaler", scaler), ("boosting", boosting)], ) parameter_grid = { "boosting__base_estimator__max_depth": scipy.stats.randint(1, 10), @@ -825,7 +824,9 @@ def test_serialize_complex_flow(self): parameter_grid = OrderedDict(sorted(parameter_grid.items())) cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True) rs = sklearn.model_selection.RandomizedSearchCV( - estimator=model, param_distributions=parameter_grid, cv=cv + estimator=model, + param_distributions=parameter_grid, + cv=cv, ) serialized, new_model = self._serialization_test_helper( rs, @@ -839,16 +840,17 @@ def test_serialize_complex_flow(self): module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" ohe_name = "sklearn.preprocessing.%s.OneHotEncoder" % module_name_encoder scaler_name = "sklearn.preprocessing.{}.StandardScaler".format( - "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" + "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data", ) tree_name = "sklearn.tree.{}.DecisionTreeClassifier".format( - "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" + "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes", ) 
weight_name = "weight" if LooseVersion(sklearn.__version__) < "0.22" else "_weight" boosting_name = "sklearn.ensemble.{}_boosting.AdaBoostClassifier(base_estimator={})".format( - weight_name, tree_name + weight_name, + tree_name, ) - pipeline_name = "sklearn.pipeline.Pipeline(ohe=%s,scaler=%s," "boosting=%s)" % ( + pipeline_name = "sklearn.pipeline.Pipeline(ohe={},scaler={}," "boosting={})".format( ohe_name, scaler_name, boosting_name, @@ -864,10 +866,10 @@ def test_serialize_complex_flow(self): pipeline_name: ["estimator"], fixture_name: [], } - self.assertEqual(serialized.name, fixture_name) - self.assertEqual(structure, fixture_structure) + assert serialized.name == fixture_name + assert structure == fixture_structure - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="Pipeline till 0.20 doesn't support 'passthrough'", @@ -878,53 +880,56 @@ def test_serialize_strings_as_pipeline_steps(self): # First check: test whether a passthrough in a pipeline is serialized correctly model = sklearn.pipeline.Pipeline(steps=[("transformer", "passthrough")]) serialized = self.extension.model_to_flow(model) - self.assertIsInstance(serialized, OpenMLFlow) - self.assertEqual(len(serialized.components), 1) - self.assertEqual(serialized.components["transformer"].name, "passthrough") + assert isinstance(serialized, OpenMLFlow) + assert len(serialized.components) == 1 + assert serialized.components["transformer"].name == "passthrough" serialized = self.extension._serialize_sklearn( - ("transformer", "passthrough"), parent_model=model + ("transformer", "passthrough"), + parent_model=model, ) - self.assertEqual(serialized, ("transformer", "passthrough")) + assert serialized == ("transformer", "passthrough") extracted_info = self.extension._extract_information_from_model(model) - self.assertEqual(len(extracted_info[2]), 1) - self.assertIsInstance(extracted_info[2]["transformer"], OpenMLFlow) - 
self.assertEqual(extracted_info[2]["transformer"].name, "passthrough") + assert len(extracted_info[2]) == 1 + assert isinstance(extracted_info[2]["transformer"], OpenMLFlow) + assert extracted_info[2]["transformer"].name == "passthrough" # Second check: test whether a lone passthrough in a column transformer is serialized # correctly model = sklearn.compose.ColumnTransformer([("passthrough", "passthrough", (0,))]) serialized = self.extension.model_to_flow(model) - self.assertIsInstance(serialized, OpenMLFlow) - self.assertEqual(len(serialized.components), 1) - self.assertEqual(serialized.components["passthrough"].name, "passthrough") + assert isinstance(serialized, OpenMLFlow) + assert len(serialized.components) == 1 + assert serialized.components["passthrough"].name == "passthrough" serialized = self.extension._serialize_sklearn( - ("passthrough", "passthrough"), parent_model=model + ("passthrough", "passthrough"), + parent_model=model, ) - self.assertEqual(serialized, ("passthrough", "passthrough")) + assert serialized == ("passthrough", "passthrough") extracted_info = self.extension._extract_information_from_model(model) - self.assertEqual(len(extracted_info[2]), 1) - self.assertIsInstance(extracted_info[2]["passthrough"], OpenMLFlow) - self.assertEqual(extracted_info[2]["passthrough"].name, "passthrough") + assert len(extracted_info[2]) == 1 + assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow) + assert extracted_info[2]["passthrough"].name == "passthrough" # Third check: passthrough and drop in a column transformer model = sklearn.compose.ColumnTransformer( - [("passthrough", "passthrough", (0,)), ("drop", "drop", (1,))] + [("passthrough", "passthrough", (0,)), ("drop", "drop", (1,))], ) serialized = self.extension.model_to_flow(model) - self.assertIsInstance(serialized, OpenMLFlow) - self.assertEqual(len(serialized.components), 2) - self.assertEqual(serialized.components["passthrough"].name, "passthrough") - 
self.assertEqual(serialized.components["drop"].name, "drop") + assert isinstance(serialized, OpenMLFlow) + assert len(serialized.components) == 2 + assert serialized.components["passthrough"].name == "passthrough" + assert serialized.components["drop"].name == "drop" serialized = self.extension._serialize_sklearn( - ("passthrough", "passthrough"), parent_model=model + ("passthrough", "passthrough"), + parent_model=model, ) - self.assertEqual(serialized, ("passthrough", "passthrough")) + assert serialized == ("passthrough", "passthrough") extracted_info = self.extension._extract_information_from_model(model) - self.assertEqual(len(extracted_info[2]), 2) - self.assertIsInstance(extracted_info[2]["passthrough"], OpenMLFlow) - self.assertIsInstance(extracted_info[2]["drop"], OpenMLFlow) - self.assertEqual(extracted_info[2]["passthrough"].name, "passthrough") - self.assertEqual(extracted_info[2]["drop"].name, "drop") + assert len(extracted_info[2]) == 2 + assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow) + assert isinstance(extracted_info[2]["drop"], OpenMLFlow) + assert extracted_info[2]["passthrough"].name == "passthrough" + assert extracted_info[2]["drop"].name == "drop" # Fourth check: having an actual preprocessor in the column transformer, too model = sklearn.compose.ColumnTransformer( @@ -932,50 +937,51 @@ def test_serialize_strings_as_pipeline_steps(self): ("passthrough", "passthrough", (0,)), ("drop", "drop", (1,)), ("test", sklearn.preprocessing.StandardScaler(), (2,)), - ] + ], ) serialized = self.extension.model_to_flow(model) - self.assertIsInstance(serialized, OpenMLFlow) - self.assertEqual(len(serialized.components), 3) - self.assertEqual(serialized.components["passthrough"].name, "passthrough") - self.assertEqual(serialized.components["drop"].name, "drop") + assert isinstance(serialized, OpenMLFlow) + assert len(serialized.components) == 3 + assert serialized.components["passthrough"].name == "passthrough" + assert 
serialized.components["drop"].name == "drop" serialized = self.extension._serialize_sklearn( - ("passthrough", "passthrough"), parent_model=model + ("passthrough", "passthrough"), + parent_model=model, ) - self.assertEqual(serialized, ("passthrough", "passthrough")) + assert serialized == ("passthrough", "passthrough") extracted_info = self.extension._extract_information_from_model(model) - self.assertEqual(len(extracted_info[2]), 3) - self.assertIsInstance(extracted_info[2]["passthrough"], OpenMLFlow) - self.assertIsInstance(extracted_info[2]["drop"], OpenMLFlow) - self.assertEqual(extracted_info[2]["passthrough"].name, "passthrough") - self.assertEqual(extracted_info[2]["drop"].name, "drop") + assert len(extracted_info[2]) == 3 + assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow) + assert isinstance(extracted_info[2]["drop"], OpenMLFlow) + assert extracted_info[2]["passthrough"].name == "passthrough" + assert extracted_info[2]["drop"].name == "drop" # Fifth check: test whether a lone drop in a feature union is serialized correctly model = sklearn.pipeline.FeatureUnion([("drop", "drop")]) serialized = self.extension.model_to_flow(model) - self.assertIsInstance(serialized, OpenMLFlow) - self.assertEqual(len(serialized.components), 1) - self.assertEqual(serialized.components["drop"].name, "drop") + assert isinstance(serialized, OpenMLFlow) + assert len(serialized.components) == 1 + assert serialized.components["drop"].name == "drop" serialized = self.extension._serialize_sklearn(("drop", "drop"), parent_model=model) - self.assertEqual(serialized, ("drop", "drop")) + assert serialized == ("drop", "drop") extracted_info = self.extension._extract_information_from_model(model) - self.assertEqual(len(extracted_info[2]), 1) - self.assertIsInstance(extracted_info[2]["drop"], OpenMLFlow) - self.assertEqual(extracted_info[2]["drop"].name, "drop") + assert len(extracted_info[2]) == 1 + assert isinstance(extracted_info[2]["drop"], OpenMLFlow) + assert 
extracted_info[2]["drop"].name == "drop" - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_type(self): supported_types = [float, np.float32, np.float64, int, np.int32, np.int64] if LooseVersion(np.__version__) < "1.24": - supported_types.append(np.float) - supported_types.append(np.int) + supported_types.append(float) + supported_types.append(int) for supported_type in supported_types: serialized = self.extension.model_to_flow(supported_type) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(deserialized, supported_type) + assert deserialized == supported_type - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_rvs(self): supported_rvs = [ scipy.stats.norm(loc=1, scale=5), @@ -986,18 +992,18 @@ def test_serialize_rvs(self): for supported_rv in supported_rvs: serialized = self.extension.model_to_flow(supported_rv) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(type(deserialized.dist), type(supported_rv.dist)) + assert type(deserialized.dist) == type(supported_rv.dist) del deserialized.dist del supported_rv.dist - self.assertEqual(deserialized.__dict__, supported_rv.__dict__) + assert deserialized.__dict__ == supported_rv.__dict__ - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_function(self): serialized = self.extension.model_to_flow(sklearn.feature_selection.chi2) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(deserialized, sklearn.feature_selection.chi2) + assert deserialized == sklearn.feature_selection.chi2 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_cvobject(self): methods = [sklearn.model_selection.KFold(3), sklearn.model_selection.LeaveOneOut()] fixtures = [ @@ -1016,13 +1022,13 @@ def test_serialize_cvobject(self): ("n_splits", "3"), ("random_state", "null"), ("shuffle", "false"), - ] + ], ), ), - ] + ], ), ), - ] + ], ), OrderedDict( [ @@ -1033,21 +1039,21 @@ def test_serialize_cvobject(self): [ ("name", 
"sklearn.model_selection._split.LeaveOneOut"), ("parameters", OrderedDict()), - ] + ], ), ), - ] + ], ), ] for method, fixture in zip(methods, fixtures): m = self.extension.model_to_flow(method) - self.assertEqual(m, fixture) + assert m == fixture m_new = self.extension.flow_to_model(m) - self.assertIsNot(m_new, m) - self.assertIsInstance(m_new, type(method)) + assert m_new is not m + assert isinstance(m_new, type(method)) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_simple_parameter_grid(self): # We cannot easily test for scipy random variables in here, but they # should be covered @@ -1058,7 +1064,7 @@ def test_serialize_simple_parameter_grid(self): [ OrderedDict([("C", [1, 10, 100, 1000]), ("kernel", ["linear"])]), OrderedDict( - [("C", [1, 10, 100, 1000]), ("gamma", [0.001, 0.0001]), ("kernel", ["rbf"])] + [("C", [1, 10, 100, 1000]), ("gamma", [0.001, 0.0001]), ("kernel", ["rbf"])], ), ], OrderedDict( @@ -1069,7 +1075,7 @@ def test_serialize_simple_parameter_grid(self): ("max_features", [1, 3, 10]), ("min_samples_leaf", [1, 3, 10]), ("min_samples_split", [1, 3, 10]), - ] + ], ), ] @@ -1077,28 +1083,30 @@ def test_serialize_simple_parameter_grid(self): serialized = self.extension.model_to_flow(grid) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(deserialized, grid) - self.assertIsNot(deserialized, grid) + assert deserialized == grid + assert deserialized is not grid # providing error_score because nan != nan hpo = sklearn.model_selection.GridSearchCV( - param_grid=grid, estimator=model, error_score=-1000 + param_grid=grid, + estimator=model, + error_score=-1000, ) serialized = self.extension.model_to_flow(hpo) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(hpo.param_grid, deserialized.param_grid) - self.assertEqual(hpo.estimator.get_params(), deserialized.estimator.get_params()) + assert hpo.param_grid == deserialized.param_grid + assert hpo.estimator.get_params() == 
deserialized.estimator.get_params() hpo_params = hpo.get_params(deep=False) deserialized_params = deserialized.get_params(deep=False) del hpo_params["estimator"] del deserialized_params["estimator"] - self.assertEqual(hpo_params, deserialized_params) + assert hpo_params == deserialized_params - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skip( "This feature needs further reworking. If we allow several " "components, we need to register them all in the downstream " - "flows. This is so far not implemented." + "flows. This is so far not implemented.", ) def test_serialize_advanced_grid(self): # TODO instead a GridSearchCV object should be serialized @@ -1120,7 +1128,7 @@ def test_serialize_advanced_grid(self): }, { "reduce_dim": [ - sklearn.feature_selection.SelectKBest(sklearn.feature_selection.chi2) + sklearn.feature_selection.SelectKBest(sklearn.feature_selection.chi2), ], "reduce_dim__k": N_FEATURES_OPTIONS, "classify__C": C_OPTIONS, @@ -1130,26 +1138,24 @@ def test_serialize_advanced_grid(self): serialized = self.extension.model_to_flow(grid) deserialized = self.extension.flow_to_model(serialized) - self.assertEqual( - grid[0]["reduce_dim"][0].get_params(), deserialized[0]["reduce_dim"][0].get_params() - ) - self.assertIsNot(grid[0]["reduce_dim"][0], deserialized[0]["reduce_dim"][0]) - self.assertEqual( - grid[0]["reduce_dim"][1].get_params(), deserialized[0]["reduce_dim"][1].get_params() + assert ( + grid[0]["reduce_dim"][0].get_params() == deserialized[0]["reduce_dim"][0].get_params() ) - self.assertIsNot(grid[0]["reduce_dim"][1], deserialized[0]["reduce_dim"][1]) - self.assertEqual( - grid[0]["reduce_dim__n_components"], deserialized[0]["reduce_dim__n_components"] + assert grid[0]["reduce_dim"][0] is not deserialized[0]["reduce_dim"][0] + assert ( + grid[0]["reduce_dim"][1].get_params() == deserialized[0]["reduce_dim"][1].get_params() ) - self.assertEqual(grid[0]["classify__C"], deserialized[0]["classify__C"]) - self.assertEqual( - 
grid[1]["reduce_dim"][0].get_params(), deserialized[1]["reduce_dim"][0].get_params() + assert grid[0]["reduce_dim"][1] is not deserialized[0]["reduce_dim"][1] + assert grid[0]["reduce_dim__n_components"] == deserialized[0]["reduce_dim__n_components"] + assert grid[0]["classify__C"] == deserialized[0]["classify__C"] + assert ( + grid[1]["reduce_dim"][0].get_params() == deserialized[1]["reduce_dim"][0].get_params() ) - self.assertIsNot(grid[1]["reduce_dim"][0], deserialized[1]["reduce_dim"][0]) - self.assertEqual(grid[1]["reduce_dim__k"], deserialized[1]["reduce_dim__k"]) - self.assertEqual(grid[1]["classify__C"], deserialized[1]["classify__C"]) + assert grid[1]["reduce_dim"][0] is not deserialized[1]["reduce_dim"][0] + assert grid[1]["reduce_dim__k"] == deserialized[1]["reduce_dim__k"] + assert grid[1]["classify__C"] == deserialized[1]["classify__C"] - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_advanced_grid_fails(self): # This unit test is checking that the test we skip above would actually fail @@ -1157,28 +1163,29 @@ def test_serialize_advanced_grid_fails(self): "base_estimator": [ sklearn.tree.DecisionTreeClassifier(), sklearn.tree.ExtraTreeClassifier(), - ] + ], } clf = sklearn.model_selection.GridSearchCV( sklearn.ensemble.BaggingClassifier(), param_grid=param_grid, ) - with self.assertRaisesRegex( - TypeError, re.compile(r".*OpenML.*Flow.*is not JSON serializable", flags=re.DOTALL) + with pytest.raises( + TypeError, + match=re.compile(r".*OpenML.*Flow.*is not JSON serializable", flags=re.DOTALL), ): self.extension.model_to_flow(clf) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_serialize_resampling(self): kfold = sklearn.model_selection.StratifiedKFold(n_splits=4, shuffle=True) serialized = self.extension.model_to_flow(kfold) deserialized = self.extension.flow_to_model(serialized) # Best approximation to get_params() - self.assertEqual(str(deserialized), str(kfold)) - self.assertIsNot(deserialized, kfold) + assert 
str(deserialized) == str(kfold) + assert deserialized is not kfold - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_hypothetical_parameter_values(self): # The hypothetical parameter values of true, 1, 0.1 formatted as a # string (and their correct serialization and deserialization) an only @@ -1189,21 +1196,21 @@ def test_hypothetical_parameter_values(self): serialized = self.extension.model_to_flow(model) serialized.external_version = "sklearn==test123" deserialized = self.extension.flow_to_model(serialized) - self.assertEqual(deserialized.get_params(), model.get_params()) - self.assertIsNot(deserialized, model) + assert deserialized.get_params() == model.get_params() + assert deserialized is not model - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_gaussian_process(self): opt = scipy.optimize.fmin_l_bfgs_b kernel = sklearn.gaussian_process.kernels.Matern() gp = sklearn.gaussian_process.GaussianProcessClassifier(kernel=kernel, optimizer=opt) - with self.assertRaisesRegex( + with pytest.raises( TypeError, - r"Matern\(length_scale=1, nu=1.5\), ", + match=r"Matern\(length_scale=1, nu=1.5\), ", ): self.extension.model_to_flow(gp) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_error_on_adding_component_multiple_times_to_flow(self): # this function implicitly checks # - openml.flows._check_multiple_occurence_of_component_in_flow() @@ -1211,24 +1218,24 @@ def test_error_on_adding_component_multiple_times_to_flow(self): pca2 = sklearn.decomposition.PCA() pipeline = sklearn.pipeline.Pipeline((("pca1", pca), ("pca2", pca2))) fixture = "Found a second occurence of component .*.PCA when trying to serialize Pipeline" - with self.assertRaisesRegex(ValueError, fixture): + with pytest.raises(ValueError, match=fixture): self.extension.model_to_flow(pipeline) fu = sklearn.pipeline.FeatureUnion((("pca1", pca), ("pca2", pca2))) fixture = ( "Found a second occurence of component .*.PCA when trying " "to serialize FeatureUnion" ) - with 
self.assertRaisesRegex(ValueError, fixture): + with pytest.raises(ValueError, match=fixture): self.extension.model_to_flow(fu) fs = sklearn.feature_selection.SelectKBest() fu2 = sklearn.pipeline.FeatureUnion((("pca1", pca), ("fs", fs))) pipeline2 = sklearn.pipeline.Pipeline((("fu", fu2), ("pca2", pca2))) fixture = "Found a second occurence of component .*.PCA when trying to serialize Pipeline" - with self.assertRaisesRegex(ValueError, fixture): + with pytest.raises(ValueError, match=fixture): self.extension.model_to_flow(pipeline2) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_subflow_version_propagated(self): this_directory = os.path.dirname(os.path.abspath(__file__)) tests_directory = os.path.abspath(os.path.join(this_directory, "..", "..")) @@ -1243,44 +1250,40 @@ def test_subflow_version_propagated(self): # I put the alternative travis-ci answer here as well. While it has a # different value, it is still correct as it is a propagation of the # subclasses' module name - self.assertEqual( - flow.external_version, - "%s,%s,%s" - % ( - self.extension._format_external_version("openml", openml.__version__), - self.extension._format_external_version("sklearn", sklearn.__version__), - self.extension._format_external_version("tests", "0.1"), - ), + assert flow.external_version == "{},{},{}".format( + self.extension._format_external_version("openml", openml.__version__), + self.extension._format_external_version("sklearn", sklearn.__version__), + self.extension._format_external_version("tests", "0.1"), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @mock.patch("warnings.warn") def test_check_dependencies(self, warnings_mock): dependencies = ["sklearn==0.1", "sklearn>=99.99.99", "sklearn>99.99.99"] for dependency in dependencies: self.assertRaises(ValueError, self.extension._check_dependencies, dependency) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_illegal_parameter_names(self): # illegal name: estimators clf1 = 
sklearn.ensemble.VotingClassifier( estimators=[ ("estimators", sklearn.ensemble.RandomForestClassifier()), ("whatevs", sklearn.ensemble.ExtraTreesClassifier()), - ] + ], ) clf2 = sklearn.ensemble.VotingClassifier( estimators=[ ("whatevs", sklearn.ensemble.RandomForestClassifier()), ("estimators", sklearn.ensemble.ExtraTreesClassifier()), - ] + ], ) cases = [clf1, clf2] for case in cases: self.assertRaises(PyOpenMLError, self.extension.model_to_flow, case) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_paralizable_check(self): # using this model should pass the test (if param distribution is # legal) @@ -1297,18 +1300,19 @@ def test_paralizable_check(self): sklearn.ensemble.RandomForestClassifier(n_jobs=5), sklearn.ensemble.RandomForestClassifier(n_jobs=-1), sklearn.pipeline.Pipeline( - steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=1))] + steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=1))], ), sklearn.pipeline.Pipeline( - steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=5))] + steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=5))], ), sklearn.pipeline.Pipeline( - steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=-1))] + steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=-1))], ), sklearn.model_selection.GridSearchCV(singlecore_bagging, legal_param_dist), sklearn.model_selection.GridSearchCV(multicore_bagging, legal_param_dist), sklearn.ensemble.BaggingClassifier( - n_jobs=-1, base_estimator=sklearn.ensemble.RandomForestClassifier(n_jobs=5) + n_jobs=-1, + base_estimator=sklearn.ensemble.RandomForestClassifier(n_jobs=5), ), ] illegal_models = [ @@ -1324,13 +1328,13 @@ def test_paralizable_check(self): X, y = sklearn.datasets.load_iris(return_X_y=True) for model, refit_time in zip(legal_models, has_refit_time): model.fit(X, y) - self.assertEqual(refit_time, hasattr(model, "refit_time_")) + assert refit_time == hasattr(model, "refit_time_") for model in illegal_models: - with self.assertRaises(PyOpenMLError): + 
with pytest.raises(PyOpenMLError): self.extension._prevent_optimize_n_jobs(model) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test__get_fn_arguments_with_defaults(self): sklearn_version = LooseVersion(sklearn.__version__) if sklearn_version < "0.19": @@ -1379,16 +1383,16 @@ def test__get_fn_arguments_with_defaults(self): for fn, num_params_with_defaults in fns: defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn) - self.assertIsInstance(defaults, dict) - self.assertIsInstance(defaultless, set) + assert isinstance(defaults, dict) + assert isinstance(defaultless, set) # check whether we have both defaults and defaultless params - self.assertEqual(len(defaults), num_params_with_defaults) - self.assertGreater(len(defaultless), 0) + assert len(defaults) == num_params_with_defaults + assert len(defaultless) > 0 # check no overlap self.assertSetEqual(set(defaults.keys()), set(defaults.keys()) - defaultless) self.assertSetEqual(defaultless, defaultless - set(defaults.keys())) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_deserialize_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter @@ -1424,7 +1428,7 @@ def test_deserialize_with_defaults(self): self.extension.model_to_flow(pipe_deserialized), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_deserialize_adaboost_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter @@ -1463,7 +1467,7 @@ def test_deserialize_adaboost_with_defaults(self): self.extension.model_to_flow(pipe_deserialized), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_deserialize_complex_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter @@ -1475,8 +1479,8 @@ def test_deserialize_complex_with_defaults(self): 
"Estimator", sklearn.ensemble.AdaBoostClassifier( sklearn.ensemble.BaggingClassifier( - sklearn.ensemble.GradientBoostingClassifier() - ) + sklearn.ensemble.GradientBoostingClassifier(), + ), ), ), ] @@ -1507,11 +1511,11 @@ def test_deserialize_complex_with_defaults(self): self.extension.model_to_flow(pipe_deserialized), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_openml_param_name_to_sklearn(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier() + base_estimator=sklearn.tree.DecisionTreeClassifier(), ) model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("boosting", boosting)]) flow = self.extension.model_to_flow(model) @@ -1524,7 +1528,7 @@ def test_openml_param_name_to_sklearn(self): setup = openml.setups.get_setup(run.setup_id) # make sure to test enough parameters - self.assertGreater(len(setup.parameters), 15) + assert len(setup.parameters) > 15 for parameter in setup.parameters.values(): sklearn_name = self.extension._openml_param_name_to_sklearn(parameter, flow) @@ -1539,32 +1543,30 @@ def test_openml_param_name_to_sklearn(self): subflow = flow.get_subflow(splitted[0:-1]) else: subflow = flow - openml_name = "%s(%s)_%s" % (subflow.name, subflow.version, splitted[-1]) - self.assertEqual(parameter.full_name, openml_name) + openml_name = f"{subflow.name}({subflow.version})_{splitted[-1]}" + assert parameter.full_name == openml_name - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_obtain_parameter_values_flow_not_from_server(self): model = sklearn.linear_model.LogisticRegression(solver="lbfgs") flow = self.extension.model_to_flow(model) logistic_name = "logistic" if LooseVersion(sklearn.__version__) < "0.22" else "_logistic" - msg = "Flow sklearn.linear_model.{}.LogisticRegression has no flow_id!".format( - logistic_name - ) + msg = f"Flow sklearn.linear_model.{logistic_name}.LogisticRegression has no flow_id!" 
- with self.assertRaisesRegex(ValueError, msg): + with pytest.raises(ValueError, match=msg): self.extension.obtain_parameter_values(flow) model = sklearn.ensemble.AdaBoostClassifier( base_estimator=sklearn.linear_model.LogisticRegression( solver="lbfgs", - ) + ), ) flow = self.extension.model_to_flow(model) flow.flow_id = 1 - with self.assertRaisesRegex(ValueError, msg): + with pytest.raises(ValueError, match=msg): self.extension.obtain_parameter_values(flow) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_obtain_parameter_values(self): model = sklearn.model_selection.RandomizedSearchCV( estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5), @@ -1584,24 +1586,25 @@ def test_obtain_parameter_values(self): flow.components["estimator"].flow_id = 2 parameters = self.extension.obtain_parameter_values(flow) for parameter in parameters: - self.assertIsNotNone(parameter["oml:component"], msg=parameter) + assert parameter["oml:component"] is not None, parameter if parameter["oml:name"] == "n_estimators": - self.assertEqual(parameter["oml:value"], "5") - self.assertEqual(parameter["oml:component"], 2) + assert parameter["oml:value"] == "5" + assert parameter["oml:component"] == 2 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_numpy_type_allowed_in_flow(self): """Simple numpy types should be serializable.""" dt = sklearn.tree.DecisionTreeClassifier( - max_depth=np.float64(3.0), min_samples_leaf=np.int32(5) + max_depth=np.float64(3.0), + min_samples_leaf=np.int32(5), ) self.extension.model_to_flow(dt) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_numpy_array_not_allowed_in_flow(self): """Simple numpy arrays should not be serializable.""" bin = sklearn.preprocessing.MultiLabelBinarizer(classes=np.asarray([1, 2, 3])) - with self.assertRaises(TypeError): + with pytest.raises(TypeError): self.extension.model_to_flow(bin) @@ -1615,7 +1618,7 @@ def setUp(self): 
################################################################################################ # Test methods for performing runs with this extension module - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_model_on_task(self): task = openml.tasks.get_task(1) # anneal; crossvalidation # using most_frequent imputer since dataset has mixed types and to keep things simple @@ -1623,11 +1626,11 @@ def test_run_model_on_task(self): [ ("imp", SimpleImputer(strategy="most_frequent")), ("dummy", sklearn.dummy.DummyClassifier()), - ] + ], ) openml.runs.run_model_on_task(pipe, task, dataset_format="array") - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_seed_model(self): # randomized models that are initialized without seeds, can be seeded randomized_clfs = [ @@ -1650,11 +1653,11 @@ def test_seed_model(self): const_probe = 42 all_params = clf.get_params() params = [key for key in all_params if key.endswith("random_state")] - self.assertGreater(len(params), 0) + assert len(params) > 0 # before param value is None for param in params: - self.assertIsNone(all_params[param]) + assert all_params[param] is None # now seed the params clf_seeded = self.extension.seed_model(clf, const_probe) @@ -1664,13 +1667,13 @@ def test_seed_model(self): # afterwards, param value is set for param in randstate_params: - self.assertIsInstance(new_params[param], int) - self.assertIsNotNone(new_params[param]) + assert isinstance(new_params[param], int) + assert new_params[param] is not None if idx == 1: - self.assertEqual(clf.cv.random_state, 56422) + assert clf.cv.random_state == 56422 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_seed_model_raises(self): # the _set_model_seed_where_none should raise exception if random_state is # anything else than an int @@ -1680,10 +1683,10 @@ def test_seed_model_raises(self): ] for clf in randomized_clfs: - with self.assertRaises(ValueError): + with pytest.raises(ValueError): self.extension.seed_model(model=clf, seed=42) - 
@pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_model_on_fold_classification_1_array(self): task = openml.tasks.get_task(1) # anneal; crossvalidation @@ -1695,7 +1698,7 @@ def test_run_model_on_fold_classification_1_array(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeClassifier())] + steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeClassifier())], ) # TODO add some mocking here to actually test the innards of this function, too! res = self.extension._run_model_on_fold( @@ -1711,26 +1714,27 @@ def test_run_model_on_fold_classification_1_array(self): y_hat, y_hat_proba, user_defined_measures, trace = res # predictions - self.assertIsInstance(y_hat, np.ndarray) - self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsInstance(y_hat_proba, pd.DataFrame) - self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 6)) + assert isinstance(y_hat, np.ndarray) + assert y_hat.shape == y_test.shape + assert isinstance(y_hat_proba, pd.DataFrame) + assert y_hat_proba.shape == (y_test.shape[0], 6) np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) # The class '4' (at index 3) is not present in the training data. We check that the # predicted probabilities for that class are zero! 
np.testing.assert_array_almost_equal( - y_hat_proba.iloc[:, 3].to_numpy(), np.zeros(y_test.shape) + y_hat_proba.iloc[:, 3].to_numpy(), + np.zeros(y_test.shape), ) for i in (0, 1, 2, 4, 5): - self.assertTrue(np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))) + assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] # trace. SGD does not produce any - self.assertIsNone(trace) + assert trace is None self._check_fold_timing_evaluations( fold_evaluations, @@ -1740,7 +1744,7 @@ def test_run_model_on_fold_classification_1_array(self): check_scores=False, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="SimpleImputer, ColumnTransformer available only after 0.19 and " @@ -1767,7 +1771,7 @@ def test_run_model_on_fold_classification_1_dataframe(self): cont_imp = make_pipeline(CustomImputer(strategy="mean"), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) pipeline = sklearn.pipeline.Pipeline( - steps=[("transform", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + steps=[("transform", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], ) # TODO add some mocking here to actually test the innards of this function, too! 
res = self.extension._run_model_on_fold( @@ -1783,26 +1787,27 @@ def test_run_model_on_fold_classification_1_dataframe(self): y_hat, y_hat_proba, user_defined_measures, trace = res # predictions - self.assertIsInstance(y_hat, np.ndarray) - self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsInstance(y_hat_proba, pd.DataFrame) - self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 6)) + assert isinstance(y_hat, np.ndarray) + assert y_hat.shape == y_test.shape + assert isinstance(y_hat_proba, pd.DataFrame) + assert y_hat_proba.shape == (y_test.shape[0], 6) np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) # The class '4' (at index 3) is not present in the training data. We check that the # predicted probabilities for that class are zero! np.testing.assert_array_almost_equal( - y_hat_proba.iloc[:, 3].to_numpy(), np.zeros(y_test.shape) + y_hat_proba.iloc[:, 3].to_numpy(), + np.zeros(y_test.shape), ) for i in (0, 1, 2, 4, 5): - self.assertTrue(np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))) + assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] # trace. 
SGD does not produce any - self.assertIsNone(trace) + assert trace is None self._check_fold_timing_evaluations( fold_evaluations, @@ -1812,7 +1817,7 @@ def test_run_model_on_fold_classification_1_dataframe(self): check_scores=False, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_model_on_fold_classification_2(self): task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation @@ -1841,22 +1846,22 @@ def test_run_model_on_fold_classification_2(self): y_hat, y_hat_proba, user_defined_measures, trace = res # predictions - self.assertIsInstance(y_hat, np.ndarray) - self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsInstance(y_hat_proba, pd.DataFrame) - self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 2)) + assert isinstance(y_hat, np.ndarray) + assert y_hat.shape == y_test.shape + assert isinstance(y_hat_proba, pd.DataFrame) + assert y_hat_proba.shape == (y_test.shape[0], 2) np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) for i in (0, 1): - self.assertTrue(np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape))) + assert np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] # check that it produced and returned a trace object of the correct length - self.assertIsInstance(trace, OpenMLRunTrace) - self.assertEqual(len(trace.trace_iterations), 2) + assert isinstance(trace, OpenMLRunTrace) + assert len(trace.trace_iterations) == 2 self._check_fold_timing_evaluations( fold_evaluations, @@ -1866,7 +1871,7 @@ def test_run_model_on_fold_classification_2(self): check_scores=False, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def 
test_run_model_on_fold_classification_3(self): class HardNaiveBayes(sklearn.naive_bayes.GaussianNB): # class for testing a naive bayes classifier that does not allow soft @@ -1887,7 +1892,9 @@ def predict_proba(*args, **kwargs): task = openml.tasks.get_task(task_id) X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices( - repeat=0, fold=0, sample=0 + repeat=0, + fold=0, + sample=0, ) X_train = X[train_indices] y_train = y[train_indices] @@ -1896,10 +1903,10 @@ def predict_proba(*args, **kwargs): steps=[ ("imputer", SimpleImputer()), ("estimator", sklearn.naive_bayes.GaussianNB()), - ] + ], ) clf2 = sklearn.pipeline.Pipeline( - steps=[("imputer", SimpleImputer()), ("estimator", HardNaiveBayes())] + steps=[("imputer", SimpleImputer()), ("estimator", HardNaiveBayes())], ) pred_1, proba_1, _, _ = self.extension._run_model_on_fold( @@ -1925,19 +1932,18 @@ def predict_proba(*args, **kwargs): np.testing.assert_array_equal(pred_1, pred_2) np.testing.assert_array_almost_equal(np.sum(proba_1, axis=1), np.ones(X_test.shape[0])) # Test that there are predictions other than ones and zeros - self.assertLess( - np.sum(proba_1.to_numpy() == 0) + np.sum(proba_1.to_numpy() == 1), - X_test.shape[0] * len(task.class_labels), - ) + assert np.sum(proba_1.to_numpy() == 0) + np.sum(proba_1.to_numpy() == 1) < X_test.shape[ + 0 + ] * len(task.class_labels) np.testing.assert_array_almost_equal(np.sum(proba_2, axis=1), np.ones(X_test.shape[0])) # Test that there are only ones and zeros predicted - self.assertEqual( - np.sum(proba_2.to_numpy() == 0) + np.sum(proba_2.to_numpy() == 1), - X_test.shape[0] * len(task.class_labels), - ) + assert np.sum(proba_2.to_numpy() == 0) + np.sum( + proba_2.to_numpy() == 1 + ) == X_test.shape[0] * len(task.class_labels) - @pytest.mark.sklearn + @pytest.mark.sklearn() + @pytest.mark.production() def test_run_model_on_fold_regression(self): # There aren't any regression tasks on the test server openml.config.server = 
self.production_server @@ -1951,7 +1957,7 @@ def test_run_model_on_fold_regression(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeRegressor())] + steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeRegressor())], ) # TODO add some mocking here to actually test the innards of this function, too! res = self.extension._run_model_on_fold( @@ -1967,17 +1973,17 @@ def test_run_model_on_fold_regression(self): y_hat, y_hat_proba, user_defined_measures, trace = res # predictions - self.assertIsInstance(y_hat, np.ndarray) - self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsNone(y_hat_proba) + assert isinstance(y_hat, np.ndarray) + assert y_hat.shape == y_test.shape + assert y_hat_proba is None # check user defined measures - fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] # trace. SGD does not produce any - self.assertIsNone(trace) + assert trace is None self._check_fold_timing_evaluations( fold_evaluations, @@ -1987,7 +1993,8 @@ def test_run_model_on_fold_regression(self): check_scores=False, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() + @pytest.mark.production() def test_run_model_on_fold_clustering(self): # There aren't any regression tasks on the test server openml.config.server = self.production_server @@ -1996,7 +2003,7 @@ def test_run_model_on_fold_clustering(self): X = task.get_X(dataset_format="array") pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.cluster.KMeans())] + steps=[("imp", SimpleImputer()), ("clf", sklearn.cluster.KMeans())], ) # TODO add some mocking here to actually test the innards of this function, too! 
res = self.extension._run_model_on_fold( @@ -2010,17 +2017,17 @@ def test_run_model_on_fold_clustering(self): y_hat, y_hat_proba, user_defined_measures, trace = res # predictions - self.assertIsInstance(y_hat, np.ndarray) - self.assertEqual(y_hat.shape, (X.shape[0],)) - self.assertIsNone(y_hat_proba) + assert isinstance(y_hat, np.ndarray) + assert y_hat.shape == (X.shape[0],) + assert y_hat_proba is None # check user defined measures - fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] # trace. SGD does not produce any - self.assertIsNone(trace) + assert trace is None self._check_fold_timing_evaluations( fold_evaluations, @@ -2030,7 +2037,7 @@ def test_run_model_on_fold_clustering(self): check_scores=False, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test__extract_trace_data(self): param_grid = { "hidden_layer_sizes": [[5, 5], [10, 10], [20, 20]], @@ -2053,34 +2060,34 @@ def test__extract_trace_data(self): clf.fit(X[train], y[train]) # check num layers of MLP - self.assertIn(clf.best_estimator_.hidden_layer_sizes, param_grid["hidden_layer_sizes"]) + assert clf.best_estimator_.hidden_layer_sizes in param_grid["hidden_layer_sizes"] trace_list = self.extension._extract_trace_data(clf, rep_no=0, fold_no=0) trace = self.extension._obtain_arff_trace(clf, trace_list) - self.assertIsInstance(trace, OpenMLRunTrace) - self.assertIsInstance(trace_list, list) - self.assertEqual(len(trace_list), num_iters) + assert isinstance(trace, OpenMLRunTrace) + assert isinstance(trace_list, list) + assert len(trace_list) == num_iters for trace_iteration in iter(trace): - self.assertEqual(trace_iteration.repeat, 0) - self.assertEqual(trace_iteration.fold, 0) - self.assertGreaterEqual(trace_iteration.iteration, 0) - 
self.assertLessEqual(trace_iteration.iteration, num_iters) - self.assertIsNone(trace_iteration.setup_string) - self.assertIsInstance(trace_iteration.evaluation, float) - self.assertTrue(np.isfinite(trace_iteration.evaluation)) - self.assertIsInstance(trace_iteration.selected, bool) - - self.assertEqual(len(trace_iteration.parameters), len(param_grid)) + assert trace_iteration.repeat == 0 + assert trace_iteration.fold == 0 + assert trace_iteration.iteration >= 0 + assert trace_iteration.iteration <= num_iters + assert trace_iteration.setup_string is None + assert isinstance(trace_iteration.evaluation, float) + assert np.isfinite(trace_iteration.evaluation) + assert isinstance(trace_iteration.selected, bool) + + assert len(trace_iteration.parameters) == len(param_grid) for param in param_grid: # Prepend with the "parameter_" prefix param_in_trace = "parameter_%s" % param - self.assertIn(param_in_trace, trace_iteration.parameters) + assert param_in_trace in trace_iteration.parameters param_value = json.loads(trace_iteration.parameters[param_in_trace]) - self.assertTrue(param_value in param_grid[param]) + assert param_value in param_grid[param] - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_trim_flow_name(self): import re @@ -2097,10 +2104,8 @@ def test_trim_flow_name(self): short = "sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)" shorter = "sklearn.Pipeline(...,SVC)" long_stripped, _ = re.subn(r"\s", "", long) - self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) - self.assertEqual( - shorter, SklearnExtension.trim_flow_name(long_stripped, extra_trim_length=50) - ) + assert short == SklearnExtension.trim_flow_name(long_stripped) + assert shorter == SklearnExtension.trim_flow_name(long_stripped, extra_trim_length=50) long = """sklearn.pipeline.Pipeline( imputation=openmlstudy14.preprocessing.ConditionalImputer, @@ -2109,16 +2114,18 @@ def test_trim_flow_name(self): classifier=sklearn.ensemble.forest.RandomForestClassifier)""" 
short = "sklearn.Pipeline(ConditionalImputer,OneHotEncoder,VarianceThreshold,RandomForestClassifier)" # noqa: E501 long_stripped, _ = re.subn(r"\s", "", long) - self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + assert short == SklearnExtension.trim_flow_name(long_stripped) long = """sklearn.pipeline.Pipeline( SimpleImputer=sklearn.preprocessing.imputation.Imputer, VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501 Estimator=sklearn.model_selection._search.RandomizedSearchCV( estimator=sklearn.tree.tree.DecisionTreeClassifier))""" - short = "sklearn.Pipeline(Imputer,VarianceThreshold,RandomizedSearchCV(DecisionTreeClassifier))" # noqa: E501 + short = ( + "sklearn.Pipeline(Imputer,VarianceThreshold,RandomizedSearchCV(DecisionTreeClassifier))" + ) long_stripped, _ = re.subn(r"\s", "", long) - self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + assert short == SklearnExtension.trim_flow_name(long_stripped) long = """sklearn.model_selection._search.RandomizedSearchCV( estimator=sklearn.pipeline.Pipeline( @@ -2126,24 +2133,22 @@ def test_trim_flow_name(self): classifier=sklearn.ensemble.forest.RandomForestClassifier))""" short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))" long_stripped, _ = re.subn(r"\s", "", long) - self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + assert short == SklearnExtension.trim_flow_name(long_stripped) long = """sklearn.pipeline.FeatureUnion( pca=sklearn.decomposition.pca.PCA, svd=sklearn.decomposition.truncated_svd.TruncatedSVD)""" short = "sklearn.FeatureUnion(PCA,TruncatedSVD)" long_stripped, _ = re.subn(r"\s", "", long) - self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + assert short == SklearnExtension.trim_flow_name(long_stripped) long = "sklearn.ensemble.forest.RandomForestClassifier" short = "sklearn.RandomForestClassifier" - self.assertEqual(short, 
SklearnExtension.trim_flow_name(long)) + assert short == SklearnExtension.trim_flow_name(long) - self.assertEqual( - "weka.IsolationForest", SklearnExtension.trim_flow_name("weka.IsolationForest") - ) + assert SklearnExtension.trim_flow_name("weka.IsolationForest") == "weka.IsolationForest" - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="SimpleImputer, ColumnTransformer available only after 0.19 and " @@ -2157,7 +2162,8 @@ def test_run_on_model_with_empty_steps(self): task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation X, y, categorical_ind, feature_names = dataset.get_data( - target=dataset.default_target_attribute, dataset_format="array" + target=dataset.default_target_attribute, + dataset_format="array", ) categorical_ind = np.array(categorical_ind) (cat_idx,) = np.where(categorical_ind) @@ -2176,8 +2182,8 @@ def test_run_on_model_with_empty_steps(self): make_pipeline(SimpleImputer(strategy="median"), StandardScaler()), cont_idx.tolist(), ), - ] - ) + ], + ), ) clf = sklearn.pipeline.Pipeline( @@ -2185,7 +2191,7 @@ def test_run_on_model_with_empty_steps(self): ("dummystep", "passthrough"), # adding 'passthrough' as an estimator ("prep", clf), ("classifier", sklearn.svm.SVC(gamma="auto")), - ] + ], ) # adding 'drop' to a ColumnTransformer @@ -2197,43 +2203,42 @@ def test_run_on_model_with_empty_steps(self): # serializing model with non-actionable step run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True) - self.assertEqual(len(flow.components), 3) - self.assertIsInstance(flow.components["dummystep"], OpenMLFlow) - self.assertEqual(flow.components["dummystep"].name, "passthrough") - self.assertIsInstance(flow.components["classifier"], OpenMLFlow) + assert len(flow.components) == 3 + assert isinstance(flow.components["dummystep"], OpenMLFlow) + assert flow.components["dummystep"].name == "passthrough" + assert isinstance(flow.components["classifier"], 
OpenMLFlow) if LooseVersion(sklearn.__version__) < "0.22": - self.assertEqual(flow.components["classifier"].name, "sklearn.svm.classes.SVC") + assert flow.components["classifier"].name == "sklearn.svm.classes.SVC" else: - self.assertEqual(flow.components["classifier"].name, "sklearn.svm._classes.SVC") - self.assertIsInstance(flow.components["prep"], OpenMLFlow) - self.assertEqual(flow.components["prep"].class_name, "sklearn.pipeline.Pipeline") - self.assertIsInstance(flow.components["prep"].components["columntransformer"], OpenMLFlow) - self.assertIsInstance( - flow.components["prep"].components["columntransformer"].components["cat"], - OpenMLFlow, + assert flow.components["classifier"].name == "sklearn.svm._classes.SVC" + assert isinstance(flow.components["prep"], OpenMLFlow) + assert flow.components["prep"].class_name == "sklearn.pipeline.Pipeline" + assert isinstance(flow.components["prep"].components["columntransformer"], OpenMLFlow) + assert isinstance( + flow.components["prep"].components["columntransformer"].components["cat"], OpenMLFlow ) - self.assertEqual( - flow.components["prep"].components["columntransformer"].components["cat"].name, "drop" + assert ( + flow.components["prep"].components["columntransformer"].components["cat"].name == "drop" ) # de-serializing flow to a model with non-actionable step model = self.extension.flow_to_model(flow) model.fit(X, y) - self.assertEqual(type(model), type(clf)) - self.assertNotEqual(model, clf) - self.assertEqual(len(model.named_steps), 3) - self.assertEqual(model.named_steps["dummystep"], "passthrough") + assert type(model) == type(clf) + assert model != clf + assert len(model.named_steps) == 3 + assert model.named_steps["dummystep"] == "passthrough" xml = flow._to_dict() new_model = self.extension.flow_to_model(OpenMLFlow._from_dict(xml)) new_model.fit(X, y) - self.assertEqual(type(new_model), type(clf)) - self.assertNotEqual(new_model, clf) - self.assertEqual(len(new_model.named_steps), 3) - 
self.assertEqual(new_model.named_steps["dummystep"], "passthrough") + assert type(new_model) == type(clf) + assert new_model != clf + assert len(new_model.named_steps) == 3 + assert new_model.named_steps["dummystep"] == "passthrough" - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_sklearn_serialization_with_none_step(self): msg = ( "Cannot serialize objects of None type. Please use a valid " @@ -2241,12 +2246,12 @@ def test_sklearn_serialization_with_none_step(self): "replaced with 'drop' or 'passthrough'." ) clf = sklearn.pipeline.Pipeline( - [("dummystep", None), ("classifier", sklearn.svm.SVC(gamma="auto"))] + [("dummystep", None), ("classifier", sklearn.svm.SVC(gamma="auto"))], ) - with self.assertRaisesRegex(ValueError, msg): + with pytest.raises(ValueError, match=msg): self.extension.model_to_flow(clf) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -2260,17 +2265,18 @@ def test_failed_serialization_of_custom_class(self): from sklearn.preprocessing import Imputer as SimpleImputer import sklearn.tree - from sklearn.pipeline import Pipeline, make_pipeline from sklearn.compose import ColumnTransformer + from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore"), ) cont_imp = make_pipeline(CustomImputer(), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) clf = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], ) # build a sklearn classifier task = openml.tasks.get_task(253) # profb; crossvalidation @@ -2282,7 +2288,7 @@ def 
test_failed_serialization_of_custom_class(self): else: raise Exception(e) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -2301,7 +2307,7 @@ def column_transformer_pipe(task_id): transformers=[ ("num", StandardScaler(), cont), ("cat", OneHotEncoder(handle_unknown="ignore"), cat), - ] + ], ) # make pipeline clf = SVC(gamma="scale", random_state=1) @@ -2309,11 +2315,10 @@ def column_transformer_pipe(task_id): # run task run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False) run.publish() - new_run = openml.runs.get_run(run.run_id) - return new_run + return openml.runs.get_run(run.run_id) run1 = column_transformer_pipe(11) # only categorical TestBase._mark_entity_for_removal("run", run1.run_id) run2 = column_transformer_pipe(23) # only numeric TestBase._mark_entity_for_removal("run", run2.run_id) - self.assertEqual(run1.setup_id, run2.setup_id) + assert run1.setup_id == run2.setup_id diff --git a/tests/test_flows/dummy_learn/dummy_forest.py b/tests/test_flows/dummy_learn/dummy_forest.py index 613f73852..65e79e760 100644 --- a/tests/test_flows/dummy_learn/dummy_forest.py +++ b/tests/test_flows/dummy_learn/dummy_forest.py @@ -1,7 +1,8 @@ # License: BSD 3-Clause +from __future__ import annotations -class DummyRegressor(object): +class DummyRegressor: def fit(self, X, y): return self diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 983ea206d..afa31ef63 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -1,14 +1,15 @@ # License: BSD 3-Clause +from __future__ import annotations import collections import copy -from distutils.version import LooseVersion import hashlib import re import time +from distutils.version import LooseVersion from unittest import mock -import pytest +import pytest import scipy.stats import sklearn import sklearn.datasets @@ -17,19 +18,18 @@ import 
sklearn.ensemble import sklearn.feature_selection import sklearn.model_selection +import sklearn.naive_bayes import sklearn.pipeline import sklearn.preprocessing -import sklearn.naive_bayes import sklearn.tree - import xmltodict import openml -from openml._api_calls import _perform_api_call import openml.exceptions import openml.extensions.sklearn -from openml.testing import TestBase, SimpleImputer import openml.utils +from openml._api_calls import _perform_api_call +from openml.testing import SimpleImputer, TestBase class TestFlow(TestBase): @@ -42,38 +42,40 @@ def setUp(self): def tearDown(self): super().tearDown() + @pytest.mark.production() def test_get_flow(self): # We need to use the production server here because 4024 is not the # test server openml.config.server = self.production_server flow = openml.flows.get_flow(4024) - self.assertIsInstance(flow, openml.OpenMLFlow) - self.assertEqual(flow.flow_id, 4024) - self.assertEqual(len(flow.parameters), 24) - self.assertEqual(len(flow.components), 1) - - subflow_1 = list(flow.components.values())[0] - self.assertIsInstance(subflow_1, openml.OpenMLFlow) - self.assertEqual(subflow_1.flow_id, 4025) - self.assertEqual(len(subflow_1.parameters), 14) - self.assertEqual(subflow_1.parameters["E"], "CC") - self.assertEqual(len(subflow_1.components), 1) - - subflow_2 = list(subflow_1.components.values())[0] - self.assertIsInstance(subflow_2, openml.OpenMLFlow) - self.assertEqual(subflow_2.flow_id, 4026) - self.assertEqual(len(subflow_2.parameters), 13) - self.assertEqual(subflow_2.parameters["I"], "10") - self.assertEqual(len(subflow_2.components), 1) - - subflow_3 = list(subflow_2.components.values())[0] - self.assertIsInstance(subflow_3, openml.OpenMLFlow) - self.assertEqual(subflow_3.flow_id, 1724) - self.assertEqual(len(subflow_3.parameters), 11) - self.assertEqual(subflow_3.parameters["L"], "-1") - self.assertEqual(len(subflow_3.components), 0) - + assert isinstance(flow, openml.OpenMLFlow) + assert flow.flow_id == 
4024 + assert len(flow.parameters) == 24 + assert len(flow.components) == 1 + + subflow_1 = next(iter(flow.components.values())) + assert isinstance(subflow_1, openml.OpenMLFlow) + assert subflow_1.flow_id == 4025 + assert len(subflow_1.parameters) == 14 + assert subflow_1.parameters["E"] == "CC" + assert len(subflow_1.components) == 1 + + subflow_2 = next(iter(subflow_1.components.values())) + assert isinstance(subflow_2, openml.OpenMLFlow) + assert subflow_2.flow_id == 4026 + assert len(subflow_2.parameters) == 13 + assert subflow_2.parameters["I"] == "10" + assert len(subflow_2.components) == 1 + + subflow_3 = next(iter(subflow_2.components.values())) + assert isinstance(subflow_3, openml.OpenMLFlow) + assert subflow_3.flow_id == 1724 + assert len(subflow_3.parameters) == 11 + assert subflow_3.parameters["L"] == "-1" + assert len(subflow_3.components) == 0 + + @pytest.mark.production() def test_get_structure(self): # also responsible for testing: flow.get_subflow # We need to use the production server here because 4024 is not the @@ -85,33 +87,35 @@ def test_get_structure(self): flow_structure_id = flow.get_structure("flow_id") # components: root (filteredclassifier), multisearch, loginboost, # reptree - self.assertEqual(len(flow_structure_name), 4) - self.assertEqual(len(flow_structure_id), 4) + assert len(flow_structure_name) == 4 + assert len(flow_structure_id) == 4 for sub_flow_name, structure in flow_structure_name.items(): if len(structure) > 0: # skip root element subflow = flow.get_subflow(structure) - self.assertEqual(subflow.name, sub_flow_name) + assert subflow.name == sub_flow_name for sub_flow_id, structure in flow_structure_id.items(): if len(structure) > 0: # skip root element subflow = flow.get_subflow(structure) - self.assertEqual(subflow.flow_id, sub_flow_id) + assert subflow.flow_id == sub_flow_id def test_tagging(self): flows = openml.flows.list_flows(size=1, output_format="dataframe") flow_id = flows["id"].iloc[0] flow = 
openml.flows.get_flow(flow_id) - tag = "testing_tag_{}_{}".format(self.id(), time.time()) + # tags can be at most 64 alphanumeric (+ underscore) chars + unique_indicator = str(time.time()).replace(".", "") + tag = f"test_tag_TestFlow_{unique_indicator}" flows = openml.flows.list_flows(tag=tag, output_format="dataframe") - self.assertEqual(len(flows), 0) + assert len(flows) == 0 flow.push_tag(tag) flows = openml.flows.list_flows(tag=tag, output_format="dataframe") - self.assertEqual(len(flows), 1) - self.assertIn(flow_id, flows["id"]) + assert len(flows) == 1 + assert flow_id in flows["id"] flow.remove_tag(tag) flows = openml.flows.list_flows(tag=tag, output_format="dataframe") - self.assertEqual(len(flows), 0) + assert len(flows) == 0 def test_from_xml_to_xml(self): # Get the raw xml thing @@ -147,13 +151,13 @@ def test_from_xml_to_xml(self): ) new_xml = re.sub(r"^$", "", new_xml) - self.assertEqual(new_xml, flow_xml) + assert new_xml == flow_xml - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_to_xml_from_xml(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier() + base_estimator=sklearn.tree.DecisionTreeClassifier(), ) model = sklearn.pipeline.Pipeline(steps=(("scaler", scaler), ("boosting", boosting))) flow = self.extension.model_to_flow(model) @@ -166,9 +170,9 @@ def test_to_xml_from_xml(self): # Would raise exception if they are not legal openml.flows.functions.assert_flows_equal(new_flow, flow) - self.assertIsNot(new_flow, flow) + assert new_flow is not flow - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_publish_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", @@ -190,70 +194,65 @@ def test_publish_flow(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", 
flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) - self.assertIsInstance(flow.flow_id, int) + assert isinstance(flow.flow_id, int) - @pytest.mark.sklearn + @pytest.mark.sklearn() @mock.patch("openml.flows.functions.flow_exists") def test_publish_existing_flow(self, flow_exists_mock): clf = sklearn.tree.DecisionTreeClassifier(max_depth=2) flow = self.extension.model_to_flow(clf) flow_exists_mock.return_value = 1 - with self.assertRaises(openml.exceptions.PyOpenMLError) as context_manager: + with pytest.raises(openml.exceptions.PyOpenMLError, match="OpenMLFlow already exists"): flow.publish(raise_error_if_exists=True) - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) - TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id) - ) - self.assertTrue("OpenMLFlow already exists" in context_manager.exception.message) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id), + ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_publish_flow_with_similar_components(self): clf = sklearn.ensemble.VotingClassifier( - [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))] + [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))], ) flow = self.extension.model_to_flow(clf) flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) # For a flow where both components are published together, the upload # date should be equal - self.assertEqual( - flow.upload_date, - flow.components["lr"].upload_date, - msg=( - flow.name, - flow.flow_id, - flow.components["lr"].name, - 
flow.components["lr"].flow_id, - ), + assert flow.upload_date == flow.components["lr"].upload_date, ( + flow.name, + flow.flow_id, + flow.components["lr"].name, + flow.components["lr"].flow_id, ) clf1 = sklearn.tree.DecisionTreeClassifier(max_depth=2) flow1 = self.extension.model_to_flow(clf1) flow1, sentinel = self._add_sentinel_to_flow_name(flow1, None) flow1.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow1.flow_id)) # In order to assign different upload times to the flows! time.sleep(1) clf2 = sklearn.ensemble.VotingClassifier( - [("dt", sklearn.tree.DecisionTreeClassifier(max_depth=2))] + [("dt", sklearn.tree.DecisionTreeClassifier(max_depth=2))], ) flow2 = self.extension.model_to_flow(clf2) flow2, _ = self._add_sentinel_to_flow_name(flow2, sentinel) flow2.publish() - TestBase._mark_entity_for_removal("flow", (flow2.flow_id, flow2.name)) + TestBase._mark_entity_for_removal("flow", flow2.flow_id, flow2.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow2.flow_id)) # If one component was published before the other, the components in # the flow should have different upload dates - self.assertNotEqual(flow2.upload_date, flow2.components["dt"].upload_date) + assert flow2.upload_date != flow2.components["dt"].upload_date clf3 = sklearn.ensemble.AdaBoostClassifier(sklearn.tree.DecisionTreeClassifier(max_depth=3)) flow3 = self.extension.model_to_flow(clf3) @@ -261,27 +260,27 @@ def test_publish_flow_with_similar_components(self): # Child flow has different parameter. Check for storing the flow # correctly on the server should thus not check the child's parameters! 
flow3.publish() - TestBase._mark_entity_for_removal("flow", (flow3.flow_id, flow3.name)) + TestBase._mark_entity_for_removal("flow", flow3.flow_id, flow3.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow3.flow_id)) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! # should not throw error as it contains two differentiable forms of # Bagging i.e., Bagging(Bagging(J48)) and Bagging(J48) semi_legal = sklearn.ensemble.BaggingClassifier( base_estimator=sklearn.ensemble.BaggingClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier() - ) + base_estimator=sklearn.tree.DecisionTreeClassifier(), + ), ) flow = self.extension.model_to_flow(semi_legal) flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) - @pytest.mark.sklearn + @pytest.mark.sklearn() @mock.patch("openml.flows.functions.get_flow") @mock.patch("openml.flows.functions.flow_exists") @mock.patch("openml._api_calls._perform_api_call") @@ -297,22 +296,15 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): flow.publish() # Not collecting flow_id for deletion since this is a test for failed upload - self.assertEqual(api_call_mock.call_count, 1) - self.assertEqual(get_flow_mock.call_count, 1) - self.assertEqual(flow_exists_mock.call_count, 1) + assert api_call_mock.call_count == 1 + assert get_flow_mock.call_count == 1 + assert flow_exists_mock.call_count == 1 flow_copy = copy.deepcopy(flow) flow_copy.name = flow_copy.name[:-1] get_flow_mock.return_value = flow_copy flow_exists_mock.return_value = 1 - with self.assertRaises(ValueError) as context_manager: - flow.publish() - TestBase._mark_entity_for_removal("flow", 
(flow.flow_id, flow.name)) - TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id) - ) - if LooseVersion(sklearn.__version__) < "0.22": fixture = ( "The flow on the server is inconsistent with the local flow. " @@ -334,11 +326,17 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): "'sklearn.ensemble._forest.RandomForestClassifier'" "\nvs\n'sklearn.ensemble._forest.RandomForestClassifie'.'" ) + with pytest.raises(ValueError, match=fixture): + flow.publish() + + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id), + ) - self.assertEqual(context_manager.exception.args[0], fixture) - self.assertEqual(get_flow_mock.call_count, 2) + assert get_flow_mock.call_count == 2 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_illegal_flow(self): # should throw error as it contains two imputers illegal = sklearn.pipeline.Pipeline( @@ -346,7 +344,7 @@ def test_illegal_flow(self): ("imputer1", SimpleImputer()), ("imputer2", SimpleImputer()), ("classif", sklearn.tree.DecisionTreeClassifier()), - ] + ], ) self.assertRaises(ValueError, self.extension.model_to_flow, illegal) @@ -358,16 +356,15 @@ def get_sentinel(): md5 = hashlib.md5() md5.update(str(time.time()).encode("utf-8")) sentinel = md5.hexdigest()[:10] - sentinel = "TEST%s" % sentinel - return sentinel + return "TEST%s" % sentinel name = get_sentinel() + get_sentinel() version = get_sentinel() flow_id = openml.flows.flow_exists(name, version) - self.assertFalse(flow_id) + assert not flow_id - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_existing_flow_exists(self): # create a flow nb = sklearn.naive_bayes.GaussianNB() @@ -391,9 +388,9 @@ def test_existing_flow_exists(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) # publish the flow flow = flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, 
flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id) + "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id), ) # redownload the flow flow = openml.flows.get_flow(flow.flow_id) @@ -404,9 +401,9 @@ def test_existing_flow_exists(self): flow.name, flow.external_version, ) - self.assertEqual(downloaded_flow_id, flow.flow_id) + assert downloaded_flow_id == flow.flow_id - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_sklearn_to_upload_to_flow(self): iris = sklearn.datasets.load_iris() X = iris.data @@ -420,14 +417,15 @@ def test_sklearn_to_upload_to_flow(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) pca = sklearn.decomposition.TruncatedSVD() fs = sklearn.feature_selection.SelectPercentile( - score_func=sklearn.feature_selection.f_classif, percentile=30 + score_func=sklearn.feature_selection.f_classif, + percentile=30, ) fu = sklearn.pipeline.FeatureUnion(transformer_list=[("pca", pca), ("fs", fs)]) boosting = sklearn.ensemble.AdaBoostClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier() + base_estimator=sklearn.tree.DecisionTreeClassifier(), ) model = sklearn.pipeline.Pipeline( - steps=[("ohe", ohe), ("scaler", scaler), ("fu", fu), ("boosting", boosting)] + steps=[("ohe", ohe), ("scaler", scaler), ("fu", fu), ("boosting", boosting)], ) parameter_grid = { "boosting__n_estimators": [1, 5, 10, 100], @@ -436,7 +434,9 @@ def test_sklearn_to_upload_to_flow(self): } cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True) rs = sklearn.model_selection.RandomizedSearchCV( - estimator=model, param_distributions=parameter_grid, cv=cv + estimator=model, + param_distributions=parameter_grid, + cv=cv, ) rs.fit(X, y) flow = self.extension.model_to_flow(rs) @@ -451,9 +451,9 @@ def test_sklearn_to_upload_to_flow(self): flow, sentinel = self._add_sentinel_to_flow_name(flow, None) flow.publish() 
- TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) - self.assertIsInstance(flow.flow_id, int) + assert isinstance(flow.flow_id, int) # Check whether we can load the flow again # Remove the sentinel from the name again so that we can reinstantiate @@ -463,7 +463,7 @@ def test_sklearn_to_upload_to_flow(self): local_xml = flow._to_xml() server_xml = new_flow._to_xml() - for i in range(10): + for _i in range(10): # Make sure that we replace all occurences of two newlines local_xml = local_xml.replace(sentinel, "") local_xml = ( @@ -484,19 +484,19 @@ def test_sklearn_to_upload_to_flow(self): ) server_xml = re.sub(r"^$", "", server_xml) - self.assertEqual(server_xml, local_xml) + assert server_xml == local_xml # Would raise exception if they are not equal! openml.flows.functions.assert_flows_equal(new_flow, flow) - self.assertIsNot(new_flow, flow) + assert new_flow is not flow # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" if LooseVersion(sklearn.__version__) < "0.22": fixture_name = ( - "%ssklearn.model_selection._search.RandomizedSearchCV(" + f"{sentinel}sklearn.model_selection._search.RandomizedSearchCV(" "estimator=sklearn.pipeline.Pipeline(" - "ohe=sklearn.preprocessing.%s.OneHotEncoder," + f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," "scaler=sklearn.preprocessing.data.StandardScaler," "fu=sklearn.pipeline.FeatureUnion(" "pca=sklearn.decomposition.truncated_svd.TruncatedSVD," @@ -504,7 +504,6 @@ def test_sklearn_to_upload_to_flow(self): "sklearn.feature_selection.univariate_selection.SelectPercentile)," "boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier(" "base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))" - % (sentinel, module_name_encoder) ) else: 
# sklearn.sklearn.preprocessing.data -> sklearn.sklearn.preprocessing._data @@ -514,9 +513,9 @@ def test_sklearn_to_upload_to_flow(self): # sklearn.ensemble.weight_boosting -> sklearn.ensemble._weight_boosting # sklearn.tree.tree.DecisionTree... -> sklearn.tree._classes.DecisionTree... fixture_name = ( - "%ssklearn.model_selection._search.RandomizedSearchCV(" + f"{sentinel}sklearn.model_selection._search.RandomizedSearchCV(" "estimator=sklearn.pipeline.Pipeline(" - "ohe=sklearn.preprocessing.%s.OneHotEncoder," + f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," "scaler=sklearn.preprocessing._data.StandardScaler," "fu=sklearn.pipeline.FeatureUnion(" "pca=sklearn.decomposition._truncated_svd.TruncatedSVD," @@ -524,44 +523,44 @@ def test_sklearn_to_upload_to_flow(self): "sklearn.feature_selection._univariate_selection.SelectPercentile)," "boosting=sklearn.ensemble._weight_boosting.AdaBoostClassifier(" "base_estimator=sklearn.tree._classes.DecisionTreeClassifier)))" - % (sentinel, module_name_encoder) ) - self.assertEqual(new_flow.name, fixture_name) + assert new_flow.name == fixture_name new_flow.model.fit(X, y) def test_extract_tags(self): flow_xml = "study_14" flow_dict = xmltodict.parse(flow_xml) tags = openml.utils.extract_xml_tags("oml:tag", flow_dict) - self.assertEqual(tags, ["study_14"]) + assert tags == ["study_14"] flow_xml = "OpenmlWeka\n" "weka" flow_dict = xmltodict.parse(flow_xml) tags = openml.utils.extract_xml_tags("oml:tag", flow_dict["oml:flow"]) - self.assertEqual(tags, ["OpenmlWeka", "weka"]) + assert tags == ["OpenmlWeka", "weka"] + @pytest.mark.production() def test_download_non_scikit_learn_flows(self): openml.config.server = self.production_server flow = openml.flows.get_flow(6742) - self.assertIsInstance(flow, openml.OpenMLFlow) - self.assertEqual(flow.flow_id, 6742) - self.assertEqual(len(flow.parameters), 19) - self.assertEqual(len(flow.components), 1) - self.assertIsNone(flow.model) - - subflow_1 = 
list(flow.components.values())[0] - self.assertIsInstance(subflow_1, openml.OpenMLFlow) - self.assertEqual(subflow_1.flow_id, 6743) - self.assertEqual(len(subflow_1.parameters), 8) - self.assertEqual(subflow_1.parameters["U"], "0") - self.assertEqual(len(subflow_1.components), 1) - self.assertIsNone(subflow_1.model) - - subflow_2 = list(subflow_1.components.values())[0] - self.assertIsInstance(subflow_2, openml.OpenMLFlow) - self.assertEqual(subflow_2.flow_id, 5888) - self.assertEqual(len(subflow_2.parameters), 4) - self.assertIsNone(subflow_2.parameters["batch-size"]) - self.assertEqual(len(subflow_2.components), 0) - self.assertIsNone(subflow_2.model) + assert isinstance(flow, openml.OpenMLFlow) + assert flow.flow_id == 6742 + assert len(flow.parameters) == 19 + assert len(flow.components) == 1 + assert flow.model is None + + subflow_1 = next(iter(flow.components.values())) + assert isinstance(subflow_1, openml.OpenMLFlow) + assert subflow_1.flow_id == 6743 + assert len(subflow_1.parameters) == 8 + assert subflow_1.parameters["U"] == "0" + assert len(subflow_1.components) == 1 + assert subflow_1.model is None + + subflow_2 = next(iter(subflow_1.components.values())) + assert isinstance(subflow_2, openml.OpenMLFlow) + assert subflow_2.flow_id == 5888 + assert len(subflow_2.parameters) == 4 + assert subflow_2.parameters["batch-size"] is None + assert len(subflow_2.components) == 0 + assert subflow_2.model is None diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 3814a8f9d..68d49eafa 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -1,24 +1,24 @@ # License: BSD 3-Clause +from __future__ import annotations -from collections import OrderedDict import copy import functools import unittest +from collections import OrderedDict +from distutils.version import LooseVersion from unittest import mock from unittest.mock import patch -from distutils.version import 
LooseVersion - +import pandas as pd +import pytest import requests import sklearn from sklearn import ensemble -import pandas as pd -import pytest import openml +import openml.extensions.sklearn from openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerException from openml.testing import TestBase, create_request_response -import openml.extensions.sklearn @pytest.mark.usefixtures("long_version") @@ -26,61 +26,66 @@ class TestFlowFunctions(TestBase): _multiprocess_can_split_ = True def setUp(self): - super(TestFlowFunctions, self).setUp() + super().setUp() def tearDown(self): - super(TestFlowFunctions, self).tearDown() + super().tearDown() def _check_flow(self, flow): - self.assertEqual(type(flow), dict) - self.assertEqual(len(flow), 6) - self.assertIsInstance(flow["id"], int) - self.assertIsInstance(flow["name"], str) - self.assertIsInstance(flow["full_name"], str) - self.assertIsInstance(flow["version"], str) + assert type(flow) == dict + assert len(flow) == 6 + assert isinstance(flow["id"], int) + assert isinstance(flow["name"], str) + assert isinstance(flow["full_name"], str) + assert isinstance(flow["version"], str) # There are some runs on openml.org that can have an empty external version ext_version_str_or_none = ( isinstance(flow["external_version"], str) or flow["external_version"] is None ) - self.assertTrue(ext_version_str_or_none) + assert ext_version_str_or_none + @pytest.mark.production() def test_list_flows(self): openml.config.server = self.production_server # We can only perform a smoke test here because we test on dynamic # data from the internet... 
flows = openml.flows.list_flows(output_format="dataframe") # 3000 as the number of flows on openml.org - self.assertGreaterEqual(len(flows), 1500) + assert len(flows) >= 1500 for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) + @pytest.mark.production() def test_list_flows_output_format(self): openml.config.server = self.production_server # We can only perform a smoke test here because we test on dynamic # data from the internet... flows = openml.flows.list_flows(output_format="dataframe") - self.assertIsInstance(flows, pd.DataFrame) - self.assertGreaterEqual(len(flows), 1500) + assert isinstance(flows, pd.DataFrame) + assert len(flows) >= 1500 + @pytest.mark.production() def test_list_flows_empty(self): openml.config.server = self.production_server flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123", output_format="dataframe") assert flows.empty + @pytest.mark.production() def test_list_flows_by_tag(self): openml.config.server = self.production_server flows = openml.flows.list_flows(tag="weka", output_format="dataframe") - self.assertGreaterEqual(len(flows), 5) + assert len(flows) >= 5 for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) + @pytest.mark.production() def test_list_flows_paginate(self): openml.config.server = self.production_server size = 10 maximum = 100 for i in range(0, maximum, size): flows = openml.flows.list_flows(offset=i, size=size, output_format="dataframe") - self.assertGreaterEqual(size, len(flows)) + assert size >= len(flows) for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) @@ -112,10 +117,7 @@ def test_are_flows_equal(self): ]: new_flow = copy.deepcopy(flow) setattr(new_flow, attribute, new_value) - self.assertNotEqual( - getattr(flow, attribute), - getattr(new_flow, attribute), - ) + assert getattr(flow, attribute) != getattr(new_flow, attribute) self.assertRaises( ValueError, openml.flows.functions.assert_flows_equal, @@ -138,10 +140,7 @@ def 
test_are_flows_equal(self): ]: new_flow = copy.deepcopy(flow) setattr(new_flow, attribute, new_value) - self.assertNotEqual( - getattr(flow, attribute), - getattr(new_flow, attribute), - ) + assert getattr(flow, attribute) != getattr(new_flow, attribute) openml.flows.functions.assert_flows_equal(flow, new_flow) # Now test for parameters @@ -158,12 +157,18 @@ def test_are_flows_equal(self): parent_flow.components["subflow"] = subflow openml.flows.functions.assert_flows_equal(parent_flow, parent_flow) self.assertRaises( - ValueError, openml.flows.functions.assert_flows_equal, parent_flow, subflow + ValueError, + openml.flows.functions.assert_flows_equal, + parent_flow, + subflow, ) new_flow = copy.deepcopy(parent_flow) new_flow.components["subflow"].name = "Subflow name" self.assertRaises( - ValueError, openml.flows.functions.assert_flows_equal, parent_flow, new_flow + ValueError, + openml.flows.functions.assert_flows_equal, + parent_flow, + new_flow, ) def test_are_flows_equal_ignore_parameter_values(self): @@ -272,7 +277,7 @@ def test_are_flows_equal_ignore_if_older(self): ) assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="OrdinalEncoder introduced in 0.20. 
" @@ -290,31 +295,32 @@ def test_sklearn_to_flow_list_of_lists(self): # Test flow is accepted by server self._add_sentinel_to_flow_name(flow) flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) # Test deserialization works server_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) - self.assertEqual(server_flow.parameters["categories"], "[[0, 1], [0, 1]]") - self.assertEqual(server_flow.model.categories, flow.model.categories) + assert server_flow.parameters["categories"] == "[[0, 1], [0, 1]]" + assert server_flow.model.categories == flow.model.categories + @pytest.mark.production() def test_get_flow1(self): # Regression test for issue #305 # Basically, this checks that a flow without an external version can be loaded openml.config.server = self.production_server flow = openml.flows.get_flow(1) - self.assertIsNone(flow.external_version) + assert flow.external_version is None - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_get_flow_reinstantiate_model(self): model = ensemble.RandomForestClassifier(n_estimators=33) extension = openml.extensions.get_extension_by_model(model) flow = extension.model_to_flow(model) flow.publish(raise_error_if_exists=False) - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) - self.assertIsInstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) + assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) def test_get_flow_reinstantiate_model_no_extension(self): # Flow 10 is a WEKA flow @@ -326,11 +332,12 @@ def 
test_get_flow_reinstantiate_model_no_extension(self): reinstantiate=True, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) == "0.19.1", reason="Requires scikit-learn!=0.19.1, because target flow is from that version.", ) + @pytest.mark.production() def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self): openml.config.server = self.production_server flow = 8175 @@ -344,44 +351,47 @@ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception( strict_version=True, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "1" and LooseVersion(sklearn.__version__) != "1.0.0", - reason="Requires scikit-learn < 1.0.1." + reason="Requires scikit-learn < 1.0.1.", # Because scikit-learn dropped min_impurity_split hyperparameter in 1.0, # and the requested flow is from 1.0.0 exactly. ) + @pytest.mark.production() def test_get_flow_reinstantiate_flow_not_strict_post_1(self): openml.config.server = self.production_server flow = openml.flows.get_flow(flow_id=19190, reinstantiate=True, strict_version=False) assert flow.flow_id is None assert "sklearn==1.0.0" not in flow.dependencies - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( (LooseVersion(sklearn.__version__) < "0.23.2") - or ("1.0" < LooseVersion(sklearn.__version__)), - reason="Requires scikit-learn 0.23.2 or ~0.24." + or (LooseVersion(sklearn.__version__) > "1.0"), + reason="Requires scikit-learn 0.23.2 or ~0.24.", # Because these still have min_impurity_split, but with new scikit-learn module structure." 
) + @pytest.mark.production() def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self): openml.config.server = self.production_server flow = openml.flows.get_flow(flow_id=18587, reinstantiate=True, strict_version=False) assert flow.flow_id is None assert "sklearn==0.23.1" not in flow.dependencies - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( - "0.23" < LooseVersion(sklearn.__version__), + LooseVersion(sklearn.__version__) > "0.23", reason="Requires scikit-learn<=0.23, because the scikit-learn module structure changed.", ) + @pytest.mark.production() def test_get_flow_reinstantiate_flow_not_strict_pre_023(self): openml.config.server = self.production_server flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False) assert flow.flow_id is None assert "sklearn==0.19.1" not in flow.dependencies - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_get_flow_id(self): if self.long_version: list_all = openml.utils._list_all @@ -390,27 +400,28 @@ def test_get_flow_id(self): with patch("openml.utils._list_all", list_all): clf = sklearn.tree.DecisionTreeClassifier() flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id) + "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id), ) - self.assertEqual(openml.flows.get_flow_id(model=clf, exact_version=True), flow.flow_id) + assert openml.flows.get_flow_id(model=clf, exact_version=True) == flow.flow_id flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) - self.assertIn(flow.flow_id, flow_ids) - self.assertGreater(len(flow_ids), 0) + assert flow.flow_id in flow_ids + assert len(flow_ids) > 0 # Check that the output of get_flow_id is identical if only the name is given, no matter # whether 
exact_version is set to True or False. flow_ids_exact_version_True = openml.flows.get_flow_id( - name=flow.name, exact_version=True + name=flow.name, + exact_version=True, ) flow_ids_exact_version_False = openml.flows.get_flow_id( name=flow.name, exact_version=False, ) - self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False) - self.assertIn(flow.flow_id, flow_ids_exact_version_True) + assert flow_ids_exact_version_True == flow_ids_exact_version_False + assert flow.flow_id in flow_ids_exact_version_True def test_delete_flow(self): flow = openml.OpenMLFlow( @@ -431,7 +442,7 @@ def test_delete_flow(self): flow.publish() _flow_id = flow.flow_id - self.assertTrue(openml.flows.delete_flow(_flow_id)) + assert openml.flows.delete_flow(_flow_id) @mock.patch.object(requests.Session, "delete") @@ -439,7 +450,8 @@ def test_delete_flow_not_owned(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_owned.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -460,7 +472,8 @@ def test_delete_flow_with_run(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_has_runs.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -481,7 +494,8 @@ def test_delete_subflow(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_is_subflow.xml" mock_delete.return_value = create_request_response( - status_code=412, 
content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -502,7 +516,8 @@ def test_delete_flow_success(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_successful.xml" mock_delete.return_value = create_request_response( - status_code=200, content_filepath=content_file + status_code=200, + content_filepath=content_file, ) success = openml.flows.delete_flow(33364) @@ -520,7 +535,8 @@ def test_delete_unknown_flow(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_exist.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index 4a4764bed..8c4c03276 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -1,15 +1,16 @@ +from __future__ import annotations + import unittest.mock +import pytest + import openml import openml.testing class TestConfig(openml.testing.TestBase): def test_too_long_uri(self): - with self.assertRaisesRegex( - openml.exceptions.OpenMLServerError, - "URI too long!", - ): + with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"): openml.datasets.list_datasets(data_id=list(range(10000)), output_format="dataframe") @unittest.mock.patch("time.sleep") @@ -25,9 +26,7 @@ def test_retry_on_database_error(self, Session_class_mock, _): "" ) Session_class_mock.return_value.__enter__.return_value.get.return_value = response_mock - with self.assertRaisesRegex( - openml.exceptions.OpenMLServerException, "/abc returned code 107" - ): + with 
pytest.raises(openml.exceptions.OpenMLServerException, match="/abc returned code 107"): openml._api_calls._send_request("get", "/abc", {}) - self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 20) + assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20 diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index ba70689a1..bfb88a5db 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -1,30 +1,33 @@ # License: BSD 3-Clause +from __future__ import annotations -import tempfile import os +import tempfile import unittest.mock +from copy import copy +from pathlib import Path + +import pytest import openml.config import openml.testing class TestConfig(openml.testing.TestBase): - @unittest.mock.patch("os.path.expanduser") @unittest.mock.patch("openml.config.openml_logger.warning") @unittest.mock.patch("openml.config._create_log_handlers") @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033") - def test_non_writable_home(self, log_handler_mock, warnings_mock, expanduser_mock): + def test_non_writable_home(self, log_handler_mock, warnings_mock): with tempfile.TemporaryDirectory(dir=self.workdir) as td: - expanduser_mock.side_effect = ( - os.path.join(td, "openmldir"), - os.path.join(td, "cachedir"), - ) os.chmod(td, 0o444) - openml.config._setup() + _dd = copy(openml.config._defaults) + _dd["cachedir"] = Path(td) / "something-else" + openml.config._setup(_dd) - self.assertEqual(warnings_mock.call_count, 2) - self.assertEqual(log_handler_mock.call_count, 1) - self.assertFalse(log_handler_mock.call_args_list[0][1]["create_file_handler"]) + assert warnings_mock.call_count == 2 + assert log_handler_mock.call_count == 1 + assert not log_handler_mock.call_args_list[0][1]["create_file_handler"] + assert openml.config._root_cache_directory == Path(td) / "something-else" @unittest.mock.patch("os.path.expanduser") def 
test_XDG_directories_do_not_exist(self, expanduser_mock): @@ -39,20 +42,20 @@ def side_effect(path_): def test_get_config_as_dict(self): """Checks if the current configuration is returned accurately as a dict.""" config = openml.config.get_config_as_dict() - _config = dict() + _config = {} _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de" _config["server"] = "https://test.openml.org/api/v1/xml" _config["cachedir"] = self.workdir _config["avoid_duplicate_runs"] = False _config["connection_n_retries"] = 20 _config["retry_policy"] = "robot" - self.assertIsInstance(config, dict) - self.assertEqual(len(config), 6) + assert isinstance(config, dict) + assert len(config) == 6 self.assertDictEqual(config, _config) def test_setup_with_config(self): """Checks if the OpenML configuration can be updated using _setup().""" - _config = dict() + _config = {} _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de" _config["server"] = "https://www.openml.org/api/v1/xml" _config["cachedir"] = self.workdir @@ -67,6 +70,7 @@ def test_setup_with_config(self): class TestConfigurationForExamples(openml.testing.TestBase): + @pytest.mark.production() def test_switch_to_example_configuration(self): """Verifies the test configuration is loaded properly.""" # Below is the default test key which would be used anyway, but just for clarity: @@ -75,9 +79,10 @@ def test_switch_to_example_configuration(self): openml.config.start_using_configuration_for_example() - self.assertEqual(openml.config.apikey, "c0c42819af31e706efe1f4b88c23c6c1") - self.assertEqual(openml.config.server, self.test_server) + assert openml.config.apikey == "c0c42819af31e706efe1f4b88c23c6c1" + assert openml.config.server == self.test_server + @pytest.mark.production() def test_switch_from_example_configuration(self): """Verifies the previous configuration is loaded after stopping.""" # Below is the default test key which would be used anyway, but just for clarity: @@ -87,16 +92,19 @@ def 
test_switch_from_example_configuration(self): openml.config.start_using_configuration_for_example() openml.config.stop_using_configuration_for_example() - self.assertEqual(openml.config.apikey, "610344db6388d9ba34f6db45a3cf71de") - self.assertEqual(openml.config.server, self.production_server) + assert openml.config.apikey == "610344db6388d9ba34f6db45a3cf71de" + assert openml.config.server == self.production_server def test_example_configuration_stop_before_start(self): """Verifies an error is raised is `stop_...` is called before `start_...`.""" error_regex = ".*stop_use_example_configuration.*start_use_example_configuration.*first" self.assertRaisesRegex( - RuntimeError, error_regex, openml.config.stop_using_configuration_for_example + RuntimeError, + error_regex, + openml.config.stop_using_configuration_for_example, ) + @pytest.mark.production() def test_example_configuration_start_twice(self): """Checks that the original config can be returned to if `start..` is called twice.""" openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de" @@ -106,5 +114,5 @@ def test_example_configuration_start_twice(self): openml.config.start_using_configuration_for_example() openml.config.stop_using_configuration_for_example() - self.assertEqual(openml.config.apikey, "610344db6388d9ba34f6db45a3cf71de") - self.assertEqual(openml.config.server, self.production_server) + assert openml.config.apikey == "610344db6388d9ba34f6db45a3cf71de" + assert openml.config.server == self.production_server diff --git a/tests/test_openml/test_openml.py b/tests/test_openml/test_openml.py index 93d2e6925..998046726 100644 --- a/tests/test_openml/test_openml.py +++ b/tests/test_openml/test_openml.py @@ -1,9 +1,10 @@ # License: BSD 3-Clause +from __future__ import annotations from unittest import mock -from openml.testing import TestBase import openml +from openml.testing import TestBase class TestInit(TestBase): @@ -22,21 +23,21 @@ def test_populate_cache( task_mock, ): 
openml.populate_cache(task_ids=[1, 2], dataset_ids=[3, 4], flow_ids=[5, 6], run_ids=[7, 8]) - self.assertEqual(run_mock.call_count, 2) + assert run_mock.call_count == 2 for argument, fixture in zip(run_mock.call_args_list, [(7,), (8,)]): - self.assertEqual(argument[0], fixture) + assert argument[0] == fixture - self.assertEqual(flow_mock.call_count, 2) + assert flow_mock.call_count == 2 for argument, fixture in zip(flow_mock.call_args_list, [(5,), (6,)]): - self.assertEqual(argument[0], fixture) + assert argument[0] == fixture - self.assertEqual(dataset_mock.call_count, 2) + assert dataset_mock.call_count == 2 for argument, fixture in zip( dataset_mock.call_args_list, [(3,), (4,)], ): - self.assertEqual(argument[0], fixture) + assert argument[0] == fixture - self.assertEqual(task_mock.call_count, 2) + assert task_mock.call_count == 2 for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]): - self.assertEqual(argument[0], fixture) + assert argument[0] == fixture diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 0396d0f19..ce46b6548 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -1,24 +1,24 @@ # License: BSD 3-Clause +from __future__ import annotations -import numpy as np -import random import os +import random from time import time +import numpy as np +import pytest import xmltodict +from sklearn.base import clone from sklearn.dummy import DummyClassifier from sklearn.linear_model import LinearRegression -from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline -from sklearn.base import clone +from sklearn.tree import DecisionTreeClassifier -from openml import OpenMLRun -from openml.testing import TestBase, SimpleImputer import openml import openml.extensions.sklearn - -import pytest +from openml import OpenMLRun +from openml.testing import SimpleImputer, TestBase class TestRun(TestBase): @@ -30,22 +30,24 @@ def 
test_tagging(self): assert not runs.empty, "Test server state is incorrect" run_id = runs["run_id"].iloc[0] run = openml.runs.get_run(run_id) - tag = "testing_tag_{}_{}".format(self.id(), time()) + # tags can be at most 64 alphanumeric (+ underscore) chars + unique_indicator = str(time()).replace(".", "") + tag = f"test_tag_TestRun_{unique_indicator}" runs = openml.runs.list_runs(tag=tag, output_format="dataframe") - self.assertEqual(len(runs), 0) + assert len(runs) == 0 run.push_tag(tag) runs = openml.runs.list_runs(tag=tag, output_format="dataframe") - self.assertEqual(len(runs), 1) - self.assertIn(run_id, runs["run_id"]) + assert len(runs) == 1 + assert run_id in runs["run_id"] run.remove_tag(tag) runs = openml.runs.list_runs(tag=tag, output_format="dataframe") - self.assertEqual(len(runs), 0) + assert len(runs) == 0 @staticmethod def _test_prediction_data_equal(run, run_prime): # Determine which attributes are numeric and which not num_cols = np.array( - [d_type == "NUMERIC" for _, d_type in run._generate_arff_dict()["attributes"]] + [d_type == "NUMERIC" for _, d_type in run._generate_arff_dict()["attributes"]], ) # Get run data consistently # (For run from server, .data_content does not exist) @@ -68,15 +70,12 @@ def _test_run_obj_equals(self, run, run_prime): # should be none or empty other = getattr(run_prime, dictionary) if other is not None: - self.assertDictEqual(other, dict()) - self.assertEqual(run._to_xml(), run_prime._to_xml()) + self.assertDictEqual(other, {}) + assert run._to_xml() == run_prime._to_xml() self._test_prediction_data_equal(run, run_prime) # Test trace - if run.trace is not None: - run_trace_content = run.trace.trace_to_arff()["data"] - else: - run_trace_content = None + run_trace_content = run.trace.trace_to_arff()["data"] if run.trace is not None else None if run_prime.trace is not None: run_prime_trace_content = run_prime.trace.trace_to_arff()["data"] @@ -88,7 +87,7 @@ def _test_run_obj_equals(self, run, run_prime): def 
_check_array(array, type_): for line in array: for entry in line: - self.assertIsInstance(entry, type_) + assert isinstance(entry, type_) int_part = [line[:3] for line in run_trace_content] _check_array(int_part, int) @@ -106,25 +105,25 @@ def _check_array(array, type_): bool_part = [line[4] for line in run_trace_content] bool_part_prime = [line[4] for line in run_prime_trace_content] for bp, bpp in zip(bool_part, bool_part_prime): - self.assertIn(bp, ["true", "false"]) - self.assertIn(bpp, ["true", "false"]) + assert bp in ["true", "false"] + assert bpp in ["true", "false"] string_part = np.array(run_trace_content)[:, 5:] string_part_prime = np.array(run_prime_trace_content)[:, 5:] np.testing.assert_array_almost_equal(int_part, int_part_prime) np.testing.assert_array_almost_equal(float_part, float_part_prime) - self.assertEqual(bool_part, bool_part_prime) + assert bool_part == bool_part_prime np.testing.assert_array_equal(string_part, string_part_prime) else: - self.assertIsNone(run_prime_trace_content) + assert run_prime_trace_content is None - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_to_from_filesystem_vanilla(self): model = Pipeline( [ ("imputer", SimpleImputer(strategy="mean")), ("classifier", DecisionTreeClassifier(max_depth=1)), - ] + ], ) task = openml.tasks.get_task(119) # diabetes; crossvalidation run = openml.runs.run_model_on_task( @@ -144,23 +143,23 @@ def test_to_from_filesystem_vanilla(self): run_prime = openml.runs.OpenMLRun.from_filesystem(cache_path) # The flow has been uploaded to server, so only the reference flow_id should be present - self.assertTrue(run_prime.flow_id is not None) - self.assertTrue(run_prime.flow is None) + assert run_prime.flow_id is not None + assert run_prime.flow is None self._test_run_obj_equals(run, run_prime) run_prime.publish() TestBase._mark_entity_for_removal("run", run_prime.run_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], run_prime.run_id) + "collected from {}: 
{}".format(__file__.split("/")[-1], run_prime.run_id), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @pytest.mark.flaky() def test_to_from_filesystem_search(self): model = Pipeline( [ ("imputer", SimpleImputer(strategy="mean")), ("classifier", DecisionTreeClassifier(max_depth=1)), - ] + ], ) model = GridSearchCV( estimator=model, @@ -186,13 +185,13 @@ def test_to_from_filesystem_search(self): run_prime.publish() TestBase._mark_entity_for_removal("run", run_prime.run_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], run_prime.run_id) + "collected from {}: {}".format(__file__.split("/")[-1], run_prime.run_id), ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_to_from_filesystem_no_model(self): model = Pipeline( - [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())] + [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())], ) task = openml.tasks.get_task(119) # diabetes; crossvalidation run = openml.runs.run_model_on_task(model=model, task=task, add_local_measures=False) @@ -211,7 +210,7 @@ def _get_models_tasks_for_tests(): [ ("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier(strategy="prior")), - ] + ], ) model_reg = Pipeline( [ @@ -221,7 +220,7 @@ def _get_models_tasks_for_tests(): # LR because dummy does not produce enough float-like values LinearRegression(), ), - ] + ], ) task_clf = openml.tasks.get_task(119) # diabetes; hold out validation @@ -256,7 +255,7 @@ def assert_run_prediction_data(task, run, model): # Get stored data for fold saved_fold_data = run.predictions[run.predictions["fold"] == fold_id].sort_values( - by="row_id" + by="row_id", ) saved_y_pred = saved_fold_data["prediction"].values gt_key = "truth" if "truth" in list(saved_fold_data) else "correct" @@ -272,7 +271,7 @@ def assert_run_prediction_data(task, run, model): assert_method(y_pred, saved_y_pred) assert_method(y_test, saved_y_test) - @pytest.mark.sklearn + 
@pytest.mark.sklearn() def test_publish_with_local_loaded_flow(self): """ Publish a run tied to a local flow after it has first been saved to @@ -284,7 +283,7 @@ def test_publish_with_local_loaded_flow(self): # Make sure the flow does not exist on the server yet. flow = extension.model_to_flow(model) self._add_sentinel_to_flow_name(flow) - self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version)) + assert not openml.flows.flow_exists(flow.name, flow.external_version) run = openml.runs.run_flow_on_task( flow=flow, @@ -295,7 +294,7 @@ def test_publish_with_local_loaded_flow(self): ) # Make sure that the flow has not been uploaded as requested. - self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version)) + assert not openml.flows.flow_exists(flow.name, flow.external_version) # Make sure that the prediction data stored in the run is correct. self.assert_run_prediction_data(task, run, clone(model)) @@ -309,14 +308,14 @@ def test_publish_with_local_loaded_flow(self): # Clean up TestBase._mark_entity_for_removal("run", loaded_run.run_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id) + "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id), ) # make sure the flow is published as part of publishing the run. - self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version)) + assert openml.flows.flow_exists(flow.name, flow.external_version) openml.runs.get_run(loaded_run.run_id) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_offline_and_online_run_identical(self): extension = openml.extensions.sklearn.SklearnExtension() @@ -324,7 +323,7 @@ def test_offline_and_online_run_identical(self): # Make sure the flow does not exist on the server yet. 
flow = extension.model_to_flow(model) self._add_sentinel_to_flow_name(flow) - self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version)) + assert not openml.flows.flow_exists(flow.name, flow.external_version) run = openml.runs.run_flow_on_task( flow=flow, @@ -335,7 +334,7 @@ def test_offline_and_online_run_identical(self): ) # Make sure that the flow has not been uploaded as requested. - self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version)) + assert not openml.flows.flow_exists(flow.name, flow.external_version) # Load from filesystem cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128))) @@ -347,7 +346,7 @@ def test_offline_and_online_run_identical(self): # Publish and test for offline - online run.publish() - self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version)) + assert openml.flows.flow_exists(flow.name, flow.external_version) try: online_run = openml.runs.get_run(run.run_id, ignore_cache=True) @@ -356,7 +355,7 @@ def test_offline_and_online_run_identical(self): # Clean up TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id) + "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id), ) def test_run_setup_string_included_in_xml(self): diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 8f3c0a71b..edd7e0198 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1,57 +1,60 @@ # License: BSD 3-Clause -import arff -from distutils.version import LooseVersion +from __future__ import annotations + +import ast import os import random import time -import sys -import ast +import unittest +import warnings +from distutils.version import LooseVersion from unittest import mock -import numpy as np +import arff import joblib +import numpy as np +import pandas as pd +import pytest import 
requests +import sklearn from joblib import parallel_backend +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import BaggingClassifier, RandomForestClassifier +from sklearn.feature_selection import VarianceThreshold +from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier +from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold +from sklearn.model_selection._search import BaseSearchCV +from sklearn.naive_bayes import GaussianNB +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier import openml -import openml.exceptions import openml._api_calls -import sklearn -import unittest -import warnings -import pandas as pd -import pytest - +import openml.exceptions import openml.extensions.sklearn -from openml.testing import TestBase, SimpleImputer, CustomImputer, create_request_response +from openml.exceptions import ( + OpenMLNotAuthorizedError, + OpenMLServerException, +) from openml.extensions.sklearn import cat, cont from openml.runs.functions import ( _run_task_get_arffcontent, - run_exists, - format_prediction, delete_run, + format_prediction, + run_exists, ) from openml.runs.trace import OpenMLRunTrace from openml.tasks import TaskType -from openml.testing import check_task_existence -from openml.exceptions import ( - OpenMLServerException, - OpenMLNotAuthorizedError, +from openml.testing import ( + CustomImputer, + SimpleImputer, + TestBase, + check_task_existence, + create_request_response, ) -from sklearn.naive_bayes import GaussianNB -from sklearn.model_selection._search import BaseSearchCV -from sklearn.tree import DecisionTreeClassifier - -from sklearn.dummy import DummyClassifier -from sklearn.preprocessing import StandardScaler, OneHotEncoder -from sklearn.feature_selection import VarianceThreshold -from sklearn.linear_model import 
LogisticRegression, SGDClassifier, LinearRegression -from sklearn.ensemble import RandomForestClassifier, BaggingClassifier -from sklearn.svm import SVC -from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold -from sklearn.pipeline import Pipeline, make_pipeline - class TestRun(TestBase): _multiprocess_can_split_ = True @@ -131,14 +134,12 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds): return raise RuntimeError( - "Could not find any evaluations! Please check whether run {} was " - "evaluated correctly on the server".format(run_id) + f"Could not find any evaluations! Please check whether run {run_id} was " + "evaluated correctly on the server", ) def _assert_predictions_equal(self, predictions, predictions_prime): - self.assertEqual( - np.array(predictions_prime["data"]).shape, np.array(predictions["data"]).shape - ) + assert np.array(predictions_prime["data"]).shape == np.array(predictions["data"]).shape # The original search model does not submit confidence # bounds, so we can not compare the arff line @@ -150,14 +151,14 @@ def _assert_predictions_equal(self, predictions, predictions_prime): for col_idx in compare_slice: val_1 = predictions["data"][idx][col_idx] val_2 = predictions_prime["data"][idx][col_idx] - if type(val_1) == float or type(val_2) == float: + if isinstance(val_1, float) or isinstance(val_2, float): self.assertAlmostEqual( float(val_1), float(val_2), places=6, ) else: - self.assertEqual(val_1, val_2) + assert val_1 == val_2 def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed, create_task_obj): run = openml.runs.get_run(run_id) @@ -211,7 +212,7 @@ def _perform_run( Runs a classifier on a task, and performs some basic checks. Also uploads the run. 
- Parameters: + Parameters ---------- task_id : int @@ -238,8 +239,8 @@ def _perform_run( sentinel: optional, str in case the sentinel should be user specified - Returns: - -------- + Returns + ------- run: OpenMLRun The performed run (with run id) """ @@ -262,13 +263,13 @@ def _remove_random_state(flow): flow, _ = self._add_sentinel_to_flow_name(flow, sentinel) if not openml.flows.flow_exists(flow.name, flow.external_version): flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) - TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) + TestBase.logger.info(f"collected from test_run_functions: {flow.flow_id}") task = openml.tasks.get_task(task_id) X, y = task.get_X_and_y() - self.assertEqual(np.count_nonzero(np.isnan(X)), n_missing_vals) + assert np.count_nonzero(np.isnan(X)) == n_missing_vals run = openml.runs.run_flow_on_task( flow=flow, task=task, @@ -277,9 +278,9 @@ def _remove_random_state(flow): ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) - self.assertEqual(run_, run) - self.assertIsInstance(run.dataset_id, int) + TestBase.logger.info(f"collected from test_run_functions: {run.run_id}") + assert run_ == run + assert isinstance(run.dataset_id, int) # This is only a smoke check right now # TODO add a few asserts here @@ -290,7 +291,7 @@ def _remove_random_state(flow): run.trace.trace_to_arff() # check arff output - self.assertEqual(len(run.data_content), num_instances) + assert len(run.data_content) == num_instances if check_setup: # test the initialize setup function @@ -307,14 +308,14 @@ def _remove_random_state(flow): flow.class_name, flow.flow_id, ) - self.assertIn("random_state", flow.parameters, error_msg) + assert "random_state" in flow.parameters, error_msg # If the flow is initialized from a model without a 
random # state, the flow is on the server without any random state - self.assertEqual(flow.parameters["random_state"], "null") + assert flow.parameters["random_state"] == "null" # As soon as a flow is run, a random state is set in the model. # If a flow is re-instantiated - self.assertEqual(flow_local.parameters["random_state"], flow_expected_rsv) - self.assertEqual(flow_server.parameters["random_state"], flow_expected_rsv) + assert flow_local.parameters["random_state"] == flow_expected_rsv + assert flow_server.parameters["random_state"] == flow_expected_rsv _remove_random_state(flow_local) _remove_random_state(flow_server) openml.flows.assert_flows_equal(flow_local, flow_server) @@ -325,7 +326,7 @@ def _remove_random_state(flow): ) flow_server2 = self.extension.model_to_flow(clf_server2) if flow.class_name not in classes_without_random_state: - self.assertEqual(flow_server2.parameters["random_state"], flow_expected_rsv) + assert flow_server2.parameters["random_state"] == flow_expected_rsv _remove_random_state(flow_server2) openml.flows.assert_flows_equal(flow_local, flow_server2) @@ -345,7 +346,12 @@ def _remove_random_state(flow): return run def _check_sample_evaluations( - self, sample_evaluations, num_repeats, num_folds, num_samples, max_time_allowed=60000 + self, + sample_evaluations, + num_repeats, + num_folds, + num_samples, + max_time_allowed=60000, ): """ Checks whether the right timing measures are attached to the run @@ -356,7 +362,6 @@ def _check_sample_evaluations( default max_time_allowed (per fold, in milli seconds) = 1 minute, quite pessimistic """ - # a dict mapping from openml measure to a tuple with the minimum and # maximum allowed value check_measures = { @@ -370,31 +375,28 @@ def _check_sample_evaluations( "predictive_accuracy": (0, 1), } - self.assertIsInstance(sample_evaluations, dict) - if sys.version_info[:2] >= (3, 3): - # this only holds if we are allowed to record time (otherwise some - # are missing) - 
self.assertEqual(set(sample_evaluations.keys()), set(check_measures.keys())) + assert isinstance(sample_evaluations, dict) + assert set(sample_evaluations.keys()) == set(check_measures.keys()) - for measure in check_measures.keys(): + for measure in check_measures: if measure in sample_evaluations: num_rep_entrees = len(sample_evaluations[measure]) - self.assertEqual(num_rep_entrees, num_repeats) + assert num_rep_entrees == num_repeats for rep in range(num_rep_entrees): num_fold_entrees = len(sample_evaluations[measure][rep]) - self.assertEqual(num_fold_entrees, num_folds) + assert num_fold_entrees == num_folds for fold in range(num_fold_entrees): num_sample_entrees = len(sample_evaluations[measure][rep][fold]) - self.assertEqual(num_sample_entrees, num_samples) + assert num_sample_entrees == num_samples for sample in range(num_sample_entrees): evaluation = sample_evaluations[measure][rep][fold][sample] - self.assertIsInstance(evaluation, float) + assert isinstance(evaluation, float) if not (os.environ.get("CI_WINDOWS") or os.name == "nt"): # Windows seems to get an eval-time of 0 sometimes. 
- self.assertGreater(evaluation, 0) - self.assertLess(evaluation, max_time_allowed) + assert evaluation > 0 + assert evaluation < max_time_allowed - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_regression_on_classif_task(self): task_id = 115 # diabetes; crossvalidation @@ -402,8 +404,8 @@ def test_run_regression_on_classif_task(self): task = openml.tasks.get_task(task_id) # internally dataframe is loaded and targets are categorical # which LinearRegression() cannot handle - with self.assertRaisesRegex( - AttributeError, "'LinearRegression' object has no attribute 'classes_'" + with pytest.raises( + AttributeError, match="'LinearRegression' object has no attribute 'classes_'" ): openml.runs.run_model_on_task( model=clf, @@ -412,7 +414,7 @@ def test_run_regression_on_classif_task(self): dataset_format="array", ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_check_erronous_sklearn_flow_fails(self): task_id = 115 # diabetes; crossvalidation task = openml.tasks.get_task(task_id) @@ -431,7 +433,7 @@ def test_check_erronous_sklearn_flow_fails(self): exceptions = (ValueError, InvalidParameterError) except ImportError: exceptions = (ValueError,) - with self.assertRaises(exceptions): + with pytest.raises(exceptions): openml.runs.run_model_on_task( task=task, model=clf, @@ -492,18 +494,18 @@ def determine_grid_size(param_grid): scores = run.get_metric_fn(metric) # compare with the scores in user defined measures scores_provided = [] - for rep in run.fold_evaluations[metric_name].keys(): - for fold in run.fold_evaluations[metric_name][rep].keys(): + for rep in run.fold_evaluations[metric_name]: + for fold in run.fold_evaluations[metric_name][rep]: scores_provided.append(run.fold_evaluations[metric_name][rep][fold]) - self.assertEqual(sum(scores_provided), sum(scores)) + assert sum(scores_provided) == sum(scores) if isinstance(clf, BaseSearchCV): trace_content = run.trace.trace_to_arff()["data"] if isinstance(clf, GridSearchCV): grid_iterations = 
determine_grid_size(clf.param_grid) - self.assertEqual(len(trace_content), grid_iterations * num_folds) + assert len(trace_content) == grid_iterations * num_folds else: - self.assertEqual(len(trace_content), num_iterations * num_folds) + assert len(trace_content) == num_iterations * num_folds # downloads the best model based on the optimization trace # suboptimal (slow), and not guaranteed to work if evaluation @@ -521,24 +523,41 @@ def determine_grid_size(param_grid): raise e self._rerun_model_and_compare_predictions( - run.run_id, model_prime, seed, create_task_obj=True + run.run_id, + model_prime, + seed, + create_task_obj=True, ) self._rerun_model_and_compare_predictions( - run.run_id, model_prime, seed, create_task_obj=False + run.run_id, + model_prime, + seed, + create_task_obj=False, ) else: run_downloaded = openml.runs.get_run(run.run_id) sid = run_downloaded.setup_id model_prime = openml.setups.initialize_model(sid) self._rerun_model_and_compare_predictions( - run.run_id, model_prime, seed, create_task_obj=True + run.run_id, + model_prime, + seed, + create_task_obj=True, ) self._rerun_model_and_compare_predictions( - run.run_id, model_prime, seed, create_task_obj=False + run.run_id, + model_prime, + seed, + create_task_obj=False, ) # todo: check if runtime is present - self._check_fold_timing_evaluations(run.fold_evaluations, 1, num_folds, task_type=task_type) + self._check_fold_timing_evaluations( + fold_evaluations=run.fold_evaluations, + num_repeats=1, + num_folds=num_folds, + task_type=task_type + ) # Check if run string and print representation do not run into an error # The above check already verifies that all columns needed for supported @@ -550,7 +569,13 @@ def determine_grid_size(param_grid): return run def _run_and_upload_classification( - self, clf, task_id, n_missing_vals, n_test_obs, flow_expected_rsv, sentinel=None + self, + clf, + task_id, + n_missing_vals, + n_test_obs, + flow_expected_rsv, + sentinel=None, ): num_folds = 1 # because of 
holdout num_iterations = 5 # for base search algorithms @@ -573,7 +598,13 @@ def _run_and_upload_classification( ) def _run_and_upload_regression( - self, clf, task_id, n_missing_vals, n_test_obs, flow_expected_rsv, sentinel=None + self, + clf, + task_id, + n_missing_vals, + n_test_obs, + flow_expected_rsv, + sentinel=None, ): num_folds = 10 # because of cross-validation num_iterations = 5 # for base search algorithms @@ -595,7 +626,7 @@ def _run_and_upload_regression( sentinel=sentinel, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_logistic_regression(self): lr = LogisticRegression(solver="lbfgs", max_iter=1000) task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] @@ -603,7 +634,7 @@ def test_run_and_upload_logistic_regression(self): n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501") - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_linear_regression(self): lr = LinearRegression() task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"] @@ -627,26 +658,26 @@ def test_run_and_upload_linear_regression(self): raise Exception(repr(e)) # mark to remove the uploaded task TestBase._mark_entity_for_removal("task", task_id) - TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + TestBase.logger.info(f"collected from test_run_functions: {task_id}") n_missing_vals = self.TEST_SERVER_TASK_REGRESSION["n_missing_vals"] n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"] self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501") - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_pipeline_dummy_pipeline(self): pipeline1 = Pipeline( steps=[ ("scaler", StandardScaler(with_mean=False)), ("dummy", DummyClassifier(strategy="prior")), - ] + ], ) task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] n_test_obs = 
self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501") - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -661,7 +692,8 @@ def get_ct_cf(nominal_indices, numeric_indices): ( "numeric", make_pipeline( - SimpleImputer(strategy="mean"), sklearn.preprocessing.StandardScaler() + SimpleImputer(strategy="mean"), + sklearn.preprocessing.StandardScaler(), ), numeric_indices, ), @@ -680,7 +712,7 @@ def get_ct_cf(nominal_indices, numeric_indices): steps=[ ("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier()), - ] + ], ) sentinel = self._get_sentinel() @@ -709,7 +741,7 @@ def get_ct_cf(nominal_indices, numeric_indices): sentinel=sentinel, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skip("https://github.com/openml/OpenML/issues/1180") @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", @@ -718,7 +750,8 @@ def get_ct_cf(nominal_indices, numeric_indices): @mock.patch("warnings.warn") def test_run_and_upload_knn_pipeline(self, warnings_mock): cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore"), ) cont_imp = make_pipeline(CustomImputer(), StandardScaler()) from sklearn.compose import ColumnTransformer @@ -733,12 +766,12 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock): "Estimator", RandomizedSearchCV( KNeighborsClassifier(), - {"n_neighbors": [x for x in range(2, 10)]}, + {"n_neighbors": list(range(2, 10))}, cv=3, n_iter=10, ), ), - ] + ], ) task_id = self.TEST_SERVER_TASK_MISSING_VALS["task_id"] @@ -758,9 +791,9 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock): for _warnings in warnings_mock.call_args_list: if _warnings[0][0] == warning_msg: call_count += 1 - 
self.assertEqual(call_count, 3) + assert call_count == 3 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_gridsearch(self): gridsearch = GridSearchCV( BaggingClassifier(base_estimator=SVC()), @@ -777,9 +810,9 @@ def test_run_and_upload_gridsearch(self): n_test_obs=n_test_obs, flow_expected_rsv="62501", ) - self.assertEqual(len(run.trace.trace_iterations), 9) + assert len(run.trace.trace_iterations) == 9 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_randomsearch(self): randomsearch = RandomizedSearchCV( RandomForestClassifier(n_estimators=5), @@ -807,11 +840,11 @@ def test_run_and_upload_randomsearch(self): n_test_obs=n_test_obs, flow_expected_rsv="12172", ) - self.assertEqual(len(run.trace.trace_iterations), 5) + assert len(run.trace.trace_iterations) == 5 trace = openml.runs.get_run_trace(run.run_id) - self.assertEqual(len(trace.trace_iterations), 5) + assert len(trace.trace_iterations) == 5 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_and_upload_maskedarrays(self): # This testcase is important for 2 reasons: # 1) it verifies the correct handling of masked arrays (not all @@ -829,12 +862,16 @@ def test_run_and_upload_maskedarrays(self): n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification( - gridsearch, task_id, n_missing_vals, n_test_obs, "12172" + gridsearch, + task_id, + n_missing_vals, + n_test_obs, + "12172", ) ########################################################################## - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_learning_curve_task_1(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -847,14 +884,18 @@ def test_learning_curve_task_1(self): steps=[ ("scaler", StandardScaler(with_mean=False)), ("dummy", DummyClassifier(strategy="prior")), - ] + ], ) run = self._perform_run( - task_id, num_test_instances, num_missing_vals, 
pipeline1, flow_expected_rsv="62501" + task_id, + num_test_instances, + num_missing_vals, + pipeline1, + flow_expected_rsv="62501", ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_learning_curve_task_2(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -873,20 +914,24 @@ def test_learning_curve_task_2(self): DecisionTreeClassifier(), { "min_samples_split": [2**x for x in range(1, 8)], - "min_samples_leaf": [2**x for x in range(0, 7)], + "min_samples_leaf": [2**x for x in range(7)], }, cv=3, n_iter=10, ), ), - ] + ], ) run = self._perform_run( - task_id, num_test_instances, num_missing_vals, pipeline2, flow_expected_rsv="62501" + task_id, + num_test_instances, + num_missing_vals, + pipeline2, + flow_expected_rsv="62501", ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="Pipelines don't support indexing (used for the assert check)", @@ -911,7 +956,7 @@ def test_initialize_cv_from_run(self): n_iter=2, ), ), - ] + ], ) task = openml.tasks.get_task(11) # kr-vs-kp; holdout @@ -923,22 +968,22 @@ def test_initialize_cv_from_run(self): ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) + TestBase.logger.info(f"collected from test_run_functions: {run.run_id}") run = openml.runs.get_run(run_.run_id) modelR = openml.runs.initialize_model_from_run(run_id=run.run_id) modelS = openml.setups.initialize_model(setup_id=run.setup_id) - self.assertEqual(modelS[-1].cv.random_state, 62501) - self.assertEqual(modelR[-1].cv.random_state, 62501) + assert modelS[-1].cv.random_state == 62501 + assert modelR[-1].cv.random_state == 62501 def _test_local_evaluations(self, run): # compare 
with the scores in user defined measures accuracy_scores_provided = [] - for rep in run.fold_evaluations["predictive_accuracy"].keys(): - for fold in run.fold_evaluations["predictive_accuracy"][rep].keys(): + for rep in run.fold_evaluations["predictive_accuracy"]: + for fold in run.fold_evaluations["predictive_accuracy"][rep]: accuracy_scores_provided.append( - run.fold_evaluations["predictive_accuracy"][rep][fold] + run.fold_evaluations["predictive_accuracy"][rep][fold], ) accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score) np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores) @@ -955,17 +1000,17 @@ def _test_local_evaluations(self, run): tests.append((sklearn.metrics.jaccard_similarity_score, {})) else: tests.append((sklearn.metrics.jaccard_score, {})) - for test_idx, test in enumerate(tests): + for _test_idx, test in enumerate(tests): alt_scores = run.get_metric_fn( sklearn_fn=test[0], kwargs=test[1], ) - self.assertEqual(len(alt_scores), 10) + assert len(alt_scores) == 10 for idx in range(len(alt_scores)): - self.assertGreaterEqual(alt_scores[idx], 0) - self.assertLessEqual(alt_scores[idx], 1) + assert alt_scores[idx] >= 0 + assert alt_scores[idx] <= 1 - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_local_run_swapped_parameter_order_model(self): clf = DecisionTreeClassifier() australian_task = 595 # Australian; crossvalidation @@ -981,7 +1026,7 @@ def test_local_run_swapped_parameter_order_model(self): self._test_local_evaluations(run) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="SimpleImputer doesn't handle mixed type DataFrame as input", @@ -993,7 +1038,7 @@ def test_local_run_swapped_parameter_order_flow(self): ("imputer", SimpleImputer(strategy="most_frequent")), ("encoder", OneHotEncoder(handle_unknown="ignore")), ("estimator", RandomForestClassifier(n_estimators=10)), - ] + ], ) flow = self.extension.model_to_flow(clf) @@ -1010,7 
+1055,7 @@ def test_local_run_swapped_parameter_order_flow(self): self._test_local_evaluations(run) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="SimpleImputer doesn't handle mixed type DataFrame as input", @@ -1022,7 +1067,7 @@ def test_local_run_metric_score(self): ("imputer", SimpleImputer(strategy="most_frequent")), ("encoder", OneHotEncoder(handle_unknown="ignore")), ("estimator", RandomForestClassifier(n_estimators=10)), - ] + ], ) # download task @@ -1038,6 +1083,7 @@ def test_local_run_metric_score(self): self._test_local_evaluations(run) + @pytest.mark.production() def test_online_run_metric_score(self): openml.config.server = self.production_server @@ -1047,7 +1093,7 @@ def test_online_run_metric_score(self): self._test_local_evaluations(run) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="SimpleImputer doesn't handle mixed type DataFrame as input", @@ -1058,7 +1104,7 @@ def test_initialize_model_from_run(self): ("Imputer", SimpleImputer(strategy="most_frequent")), ("VarianceThreshold", VarianceThreshold(threshold=0.05)), ("Estimator", GaussianNB()), - ] + ], ) task_meta_data = { "task_type": TaskType.SUPERVISED_CLASSIFICATION, @@ -1084,7 +1130,7 @@ def test_initialize_model_from_run(self): raise Exception(repr(e)) # mark to remove the uploaded task TestBase._mark_entity_for_removal("task", task_id) - TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + TestBase.logger.info(f"collected from test_run_functions: {task_id}") task = openml.tasks.get_task(task_id) run = openml.runs.run_model_on_task( @@ -1094,7 +1140,7 @@ def test_initialize_model_from_run(self): ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run_.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run_.run_id)) + TestBase.logger.info(f"collected from test_run_functions: 
{run_.run_id}") run = openml.runs.get_run(run_.run_id) modelR = openml.runs.initialize_model_from_run(run_id=run.run_id) @@ -1106,10 +1152,10 @@ def test_initialize_model_from_run(self): openml.flows.assert_flows_equal(flowR, flowL) openml.flows.assert_flows_equal(flowS, flowL) - self.assertEqual(flowS.components["Imputer"].parameters["strategy"], '"most_frequent"') - self.assertEqual(flowS.components["VarianceThreshold"].parameters["threshold"], "0.05") + assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"' + assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05" - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="SimpleImputer doesn't handle mixed type DataFrame as input", @@ -1125,14 +1171,14 @@ def test__run_exists(self): ("Imputer", SimpleImputer(strategy="mean")), ("VarianceThreshold", VarianceThreshold(threshold=0.05)), ("Estimator", DecisionTreeClassifier(max_depth=4)), - ] + ], ), sklearn.pipeline.Pipeline( steps=[ ("Imputer", SimpleImputer(strategy="most_frequent")), ("VarianceThreshold", VarianceThreshold(threshold=0.1)), ("Estimator", DecisionTreeClassifier(max_depth=4)), - ] + ], ), ] @@ -1143,28 +1189,32 @@ def test__run_exists(self): # first populate the server with this run. # skip run if it was already performed. run = openml.runs.run_model_on_task( - model=clf, task=task, seed=rs, avoid_duplicate_runs=True, upload_flow=True + model=clf, + task=task, + seed=rs, + avoid_duplicate_runs=True, + upload_flow=True, ) run.publish() TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) + TestBase.logger.info(f"collected from test_run_functions: {run.run_id}") except openml.exceptions.PyOpenMLError: # run already existed. Great. 
pass flow = self.extension.model_to_flow(clf) flow_exists = openml.flows.flow_exists(flow.name, flow.external_version) - self.assertGreater(flow_exists, 0, "Server says flow from run does not exist.") + assert flow_exists > 0, "Server says flow from run does not exist." # Do NOT use get_flow reinitialization, this potentially sets # hyperparameter values wrong. Rather use the local model. downloaded_flow = openml.flows.get_flow(flow_exists) downloaded_flow.model = clf setup_exists = openml.setups.setup_exists(downloaded_flow) - self.assertGreater(setup_exists, 0, "Server says setup of run does not exist.") + assert setup_exists > 0, "Server says setup of run does not exist." run_ids = run_exists(task.task_id, setup_exists) - self.assertTrue(run_ids, msg=(run_ids, clf)) + assert run_ids, (run_ids, clf) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_with_illegal_flow_id(self): # check the case where the user adds an illegal flow id to a # non-existing flo @@ -1174,16 +1224,16 @@ def test_run_with_illegal_flow_id(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.flow_id = -1 expected_message_regex = ( - "Flow does not exist on the server, " "but 'flow.flow_id' is not None." + r"Flow does not exist on the server, but 'flow.flow_id' is not None." ) - with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex): + with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): openml.runs.run_flow_on_task( task=task, flow=flow, avoid_duplicate_runs=True, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_with_illegal_flow_id_after_load(self): # Same as `test_run_with_illegal_flow_id`, but test this error is also # caught if the run is stored to and loaded from disk first. 
@@ -1193,7 +1243,10 @@ def test_run_with_illegal_flow_id_after_load(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.flow_id = -1 run = openml.runs.run_flow_on_task( - task=task, flow=flow, avoid_duplicate_runs=False, upload_flow=False + task=task, + flow=flow, + avoid_duplicate_runs=False, + upload_flow=False, ) cache_path = os.path.join( @@ -1205,14 +1258,14 @@ def test_run_with_illegal_flow_id_after_load(self): loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) expected_message_regex = ( - "Flow does not exist on the server, " "but 'flow.flow_id' is not None." + r"Flow does not exist on the server, but 'flow.flow_id' is not None." ) - with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex): + with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): loaded_run.publish() TestBase._mark_entity_for_removal("run", loaded_run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(loaded_run.run_id)) + TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}") - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_with_illegal_flow_id_1(self): # Check the case where the user adds an illegal flow id to an existing # flow. 
Comes to a different value error than the previous test @@ -1221,8 +1274,8 @@ def test_run_with_illegal_flow_id_1(self): flow_orig = self.extension.model_to_flow(clf) try: flow_orig.publish() # ensures flow exist on server - TestBase._mark_entity_for_removal("flow", (flow_orig.flow_id, flow_orig.name)) - TestBase.logger.info("collected from test_run_functions: {}".format(flow_orig.flow_id)) + TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name) + TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}") except openml.exceptions.OpenMLServerException: # flow already exists pass @@ -1230,14 +1283,14 @@ def test_run_with_illegal_flow_id_1(self): flow_new.flow_id = -1 expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'" - with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex): + with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): openml.runs.run_flow_on_task( task=task, flow=flow_new, avoid_duplicate_runs=True, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_with_illegal_flow_id_1_after_load(self): # Same as `test_run_with_illegal_flow_id_1`, but test this error is # also caught if the run is stored to and loaded from disk first. 
@@ -1246,8 +1299,8 @@ def test_run_with_illegal_flow_id_1_after_load(self): flow_orig = self.extension.model_to_flow(clf) try: flow_orig.publish() # ensures flow exist on server - TestBase._mark_entity_for_removal("flow", (flow_orig.flow_id, flow_orig.name)) - TestBase.logger.info("collected from test_run_functions: {}".format(flow_orig.flow_id)) + TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name) + TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}") except openml.exceptions.OpenMLServerException: # flow already exists pass @@ -1255,7 +1308,10 @@ def test_run_with_illegal_flow_id_1_after_load(self): flow_new.flow_id = -1 run = openml.runs.run_flow_on_task( - task=task, flow=flow_new, avoid_duplicate_runs=False, upload_flow=False + task=task, + flow=flow_new, + avoid_duplicate_runs=False, + upload_flow=False, ) cache_path = os.path.join( @@ -1268,10 +1324,12 @@ def test_run_with_illegal_flow_id_1_after_load(self): expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'" self.assertRaisesRegex( - openml.exceptions.PyOpenMLError, expected_message_regex, loaded_run.publish + openml.exceptions.PyOpenMLError, + expected_message_regex, + loaded_run.publish, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="OneHotEncoder cannot handle mixed type DataFrame as input", @@ -1283,7 +1341,8 @@ def test__run_task_get_arffcontent(self): num_repeats = 1 clf = make_pipeline( - OneHotEncoder(handle_unknown="ignore"), SGDClassifier(loss="log", random_state=1) + OneHotEncoder(handle_unknown="ignore"), + SGDClassifier(loss="log", random_state=1), ) res = openml.runs.functions._run_task_get_arffcontent( extension=self.extension, @@ -1294,46 +1353,50 @@ def test__run_task_get_arffcontent(self): ) arff_datacontent, trace, fold_evaluations, _ = res # predictions - self.assertIsInstance(arff_datacontent, list) + assert 
isinstance(arff_datacontent, list) # trace. SGD does not produce any - self.assertIsInstance(trace, type(None)) + assert isinstance(trace, type(None)) task_type = TaskType.SUPERVISED_CLASSIFICATION self._check_fold_timing_evaluations( - fold_evaluations, num_repeats, num_folds, task_type=task_type + fold_evaluations=fold_evaluations, + num_repeats=num_repeats, + num_folds=num_folds, + task_type=task_type, ) # 10 times 10 fold CV of 150 samples - self.assertEqual(len(arff_datacontent), num_instances * num_repeats) + assert len(arff_datacontent) == num_instances * num_repeats for arff_line in arff_datacontent: # check number columns - self.assertEqual(len(arff_line), 8) + assert len(arff_line) == 8 # check repeat - self.assertGreaterEqual(arff_line[0], 0) - self.assertLessEqual(arff_line[0], num_repeats - 1) + assert arff_line[0] >= 0 + assert arff_line[0] <= num_repeats - 1 # check fold - self.assertGreaterEqual(arff_line[1], 0) - self.assertLessEqual(arff_line[1], num_folds - 1) + assert arff_line[1] >= 0 + assert arff_line[1] <= num_folds - 1 # check row id - self.assertGreaterEqual(arff_line[2], 0) - self.assertLessEqual(arff_line[2], num_instances - 1) + assert arff_line[2] >= 0 + assert arff_line[2] <= num_instances - 1 # check prediction and ground truth columns - self.assertIn(arff_line[4], ["won", "nowin"]) - self.assertIn(arff_line[5], ["won", "nowin"]) + assert arff_line[4] in ["won", "nowin"] + assert arff_line[5] in ["won", "nowin"] # check confidences self.assertAlmostEqual(sum(arff_line[6:]), 1.0) def test__create_trace_from_arff(self): - with open(self.static_cache_dir + "/misc/trace.arff", "r") as arff_file: + with open(self.static_cache_dir / "misc" / "trace.arff") as arff_file: trace_arff = arff.load(arff_file) OpenMLRunTrace.trace_from_arff(trace_arff) + @pytest.mark.production() def test_get_run(self): # this run is not available on test openml.config.server = self.production_server run = openml.runs.get_run(473351) - 
self.assertEqual(run.dataset_id, 357) - self.assertEqual(run.evaluations["f_measure"], 0.841225) + assert run.dataset_id == 357 + assert run.evaluations["f_measure"] == 0.841225 for i, value in [ (0, 0.840918), (1, 0.839458), @@ -1346,7 +1409,7 @@ def test_get_run(self): (8, 0.84218), (9, 0.844014), ]: - self.assertEqual(run.fold_evaluations["f_measure"][0][i], value) + assert run.fold_evaluations["f_measure"][0][i] == value assert "weka" in run.tags assert "weka_3.7.12" in run.tags assert run.predictions_url == ( @@ -1360,14 +1423,15 @@ def _check_run(self, run): # They are run_id, task_id, task_type_id, setup_id, flow_id, uploader, upload_time # error_message and run_details exist, too, but are not used so far. We need to update # this check once they are used! - self.assertIsInstance(run, dict) + assert isinstance(run, dict) assert len(run) == 8, str(run) + @pytest.mark.production() def test_get_runs_list(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server runs = openml.runs.list_runs(id=[2], show_errors=True, output_format="dataframe") - self.assertEqual(len(runs), 1) + assert len(runs) == 1 for run in runs.to_dict(orient="index").values(): self._check_run(run) @@ -1377,26 +1441,28 @@ def test_list_runs_empty(self): def test_list_runs_output_format(self): runs = openml.runs.list_runs(size=1000, output_format="dataframe") - self.assertIsInstance(runs, pd.DataFrame) + assert isinstance(runs, pd.DataFrame) + @pytest.mark.production() def test_get_runs_list_by_task(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server task_ids = [20] runs = openml.runs.list_runs(task=task_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), 590) + assert len(runs) >= 590 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["task_id"], task_ids) + assert run["task_id"] in task_ids self._check_run(run) num_runs = len(runs) task_ids.append(21) runs = 
openml.runs.list_runs(task=task_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), num_runs + 1) + assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["task_id"], task_ids) + assert run["task_id"] in task_ids self._check_run(run) + @pytest.mark.production() def test_get_runs_list_by_uploader(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server @@ -1404,38 +1470,40 @@ def test_get_runs_list_by_uploader(self): uploader_ids = [29] runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), 2) + assert len(runs) >= 2 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["uploader"], uploader_ids) + assert run["uploader"] in uploader_ids self._check_run(run) num_runs = len(runs) uploader_ids.append(274) runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), num_runs + 1) + assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["uploader"], uploader_ids) + assert run["uploader"] in uploader_ids self._check_run(run) + @pytest.mark.production() def test_get_runs_list_by_flow(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server flow_ids = [1154] runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), 1) + assert len(runs) >= 1 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["flow_id"], flow_ids) + assert run["flow_id"] in flow_ids self._check_run(run) num_runs = len(runs) flow_ids.append(1069) runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe") - self.assertGreaterEqual(len(runs), num_runs + 1) + assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): - self.assertIn(run["flow_id"], flow_ids) + assert run["flow_id"] in 
flow_ids self._check_run(run) + @pytest.mark.production() def test_get_runs_pagination(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server @@ -1444,12 +1512,16 @@ def test_get_runs_pagination(self): max = 100 for i in range(0, max, size): runs = openml.runs.list_runs( - offset=i, size=size, uploader=uploader_ids, output_format="dataframe" + offset=i, + size=size, + uploader=uploader_ids, + output_format="dataframe", ) - self.assertGreaterEqual(size, len(runs)) + assert size >= len(runs) for run in runs.to_dict(orient="index").values(): - self.assertIn(run["uploader"], uploader_ids) + assert run["uploader"] in uploader_ids + @pytest.mark.production() def test_get_runs_list_by_filters(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server @@ -1468,30 +1540,34 @@ def test_get_runs_list_by_filters(self): # openml.runs.list_runs) runs = openml.runs.list_runs(id=ids, output_format="dataframe") - self.assertEqual(len(runs), 2) + assert len(runs) == 2 runs = openml.runs.list_runs(task=tasks, output_format="dataframe") - self.assertGreaterEqual(len(runs), 2) + assert len(runs) >= 2 runs = openml.runs.list_runs(uploader=uploaders_2, output_format="dataframe") - self.assertGreaterEqual(len(runs), 10) + assert len(runs) >= 10 runs = openml.runs.list_runs(flow=flows, output_format="dataframe") - self.assertGreaterEqual(len(runs), 100) + assert len(runs) >= 100 runs = openml.runs.list_runs( - id=ids, task=tasks, uploader=uploaders_1, output_format="dataframe" + id=ids, + task=tasks, + uploader=uploaders_1, + output_format="dataframe", ) - self.assertEqual(len(runs), 2) + assert len(runs) == 2 + @pytest.mark.production() def test_get_runs_list_by_tag(self): # TODO: comes from live, no such lists on test # Unit test works on production server only openml.config.server = self.production_server runs = openml.runs.list_runs(tag="curves", output_format="dataframe") - 
self.assertGreaterEqual(len(runs), 1) + assert len(runs) >= 1 - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -1505,12 +1581,13 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): from sklearn.compose import ColumnTransformer cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore"), ) cont_imp = make_pipeline(CustomImputer(), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) model = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], ) # build a sklearn classifier data_content, _, _, _ = _run_task_get_arffcontent( @@ -1522,12 +1599,12 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): ) # 2 folds, 5 repeats; keep in mind that this task comes from the test # server, the task on the live server is different - self.assertEqual(len(data_content), 4490) + assert len(data_content) == 4490 for row in data_content: # repeat, fold, row_id, 6 confidences, prediction and correct label - self.assertEqual(len(row), 12) + assert len(row) == 12 - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -1548,12 +1625,13 @@ def test_run_on_dataset_with_missing_labels_array(self): from sklearn.compose import ColumnTransformer cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore"), ) cont_imp = make_pipeline(CustomImputer(), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, 
cont)]) model = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], ) # build a sklearn classifier data_content, _, _, _ = _run_task_get_arffcontent( @@ -1565,10 +1643,10 @@ def test_run_on_dataset_with_missing_labels_array(self): ) # 2 folds, 5 repeats; keep in mind that this task comes from the test # server, the task on the live server is different - self.assertEqual(len(data_content), 4490) + assert len(data_content) == 4490 for row in data_content: # repeat, fold, row_id, 6 confidences, prediction and correct label - self.assertEqual(len(row), 12) + assert len(row) == 12 def test_get_cached_run(self): openml.config.set_root_cache_directory(self.static_cache_dir) @@ -1576,16 +1654,16 @@ def test_get_cached_run(self): def test_get_uncached_run(self): openml.config.set_root_cache_directory(self.static_cache_dir) - with self.assertRaises(openml.exceptions.OpenMLCacheException): + with pytest.raises(openml.exceptions.OpenMLCacheException): openml.runs.functions._get_cached_run(10) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_run_flow_on_task_downloaded_flow(self): model = sklearn.ensemble.RandomForestClassifier(n_estimators=33) flow = self.extension.model_to_flow(model) flow.publish(raise_error_if_exists=False) - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) - TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) + TestBase.logger.info(f"collected from test_run_functions: {flow.flow_id}") downloaded_flow = openml.flows.get_flow(flow.flow_id) task = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE["task_id"]) @@ -1600,49 +1678,51 @@ def test_run_flow_on_task_downloaded_flow(self): TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], 
run.run_id)) + @pytest.mark.production() def test_format_prediction_non_supervised(self): # non-supervised tasks don't exist on the test server openml.config.server = self.production_server clustering = openml.tasks.get_task(126033, download_data=False) ignored_input = [0] * 5 - with self.assertRaisesRegex( - NotImplementedError, r"Formatting for is not supported." + with pytest.raises( + NotImplementedError, match=r"Formatting for is not supported." ): format_prediction(clustering, *ignored_input) def test_format_prediction_classification_no_probabilities(self): classification = openml.tasks.get_task( - self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False + self.TEST_SERVER_TASK_SIMPLE["task_id"], + download_data=False, ) ignored_input = [0] * 5 - with self.assertRaisesRegex(ValueError, "`proba` is required for classification task"): + with pytest.raises(ValueError, match="`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) def test_format_prediction_classification_incomplete_probabilities(self): classification = openml.tasks.get_task( - self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False + self.TEST_SERVER_TASK_SIMPLE["task_id"], + download_data=False, ) ignored_input = [0] * 5 incomplete_probabilities = {c: 0.2 for c in classification.class_labels[1:]} - with self.assertRaisesRegex(ValueError, "Each class should have a predicted probability"): + with pytest.raises(ValueError, match="Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) def test_format_prediction_task_without_classlabels_set(self): classification = openml.tasks.get_task( - self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False + self.TEST_SERVER_TASK_SIMPLE["task_id"], + download_data=False, ) classification.class_labels = None ignored_input = [0] * 5 - with self.assertRaisesRegex( - ValueError, "The classification task must have class 
labels set" - ): + with pytest.raises(ValueError, match="The classification task must have class labels set"): format_prediction(classification, *ignored_input, proba={}) def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} ignored_input = [0] * 5 - with self.assertRaisesRegex(ValueError, "`sample` can not be none for LearningCurveTask"): + with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"): format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) def test_format_prediction_task_regression(self): @@ -1665,14 +1745,14 @@ def test_format_prediction_task_regression(self): raise Exception(repr(e)) # mark to remove the uploaded task TestBase._mark_entity_for_removal("task", task_id) - TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + TestBase.logger.info(f"collected from test_run_functions: {task_id}") regression = openml.tasks.get_task(task_id, download_data=False) ignored_input = [0] * 5 res = format_prediction(regression, *ignored_input) self.assertListEqual(res, [0] * 5) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="couldn't perform local tests successfully w/o bloating RAM", @@ -1703,12 +1783,12 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): # The _prevent_optimize_n_jobs() is a function executed within the _run_model_on_fold() # block and mocking this function doesn't affect rest of the pipeline, but is adequately # indicative if _run_model_on_fold() is being called or not. 
- self.assertEqual(parallel_mock.call_count, 0) - self.assertIsInstance(res[0], list) - self.assertEqual(len(res[0]), num_instances) - self.assertEqual(len(res[0][0]), line_length) - self.assertEqual(len(res[2]), 7) - self.assertEqual(len(res[3]), 7) + assert parallel_mock.call_count == 0 + assert isinstance(res[0], list) + assert len(res[0]) == num_instances + assert len(res[0][0]) == line_length + assert len(res[2]) == 7 + assert len(res[3]) == 7 expected_scores = [ 0.965625, 0.94375, @@ -1723,10 +1803,12 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): ] scores = [v for k, v in res[2]["predictive_accuracy"][0].items()] np.testing.assert_array_almost_equal( - scores, expected_scores, decimal=2 if os.name == "nt" else 7 + scores, + expected_scores, + decimal=2 if os.name == "nt" else 7, ) - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.21", reason="couldn't perform local tests successfully w/o bloating RAM", @@ -1760,7 +1842,9 @@ def test_joblib_backends(self, parallel_mock): }, random_state=1, cv=sklearn.model_selection.StratifiedKFold( - n_splits=2, shuffle=True, random_state=1 + n_splits=2, + shuffle=True, + random_state=1, ), n_iter=5, n_jobs=n_jobs, @@ -1774,14 +1858,14 @@ def test_joblib_backends(self, parallel_mock): dataset_format="array", # "dataframe" would require handling of categoricals n_jobs=n_jobs, ) - self.assertEqual(type(res[0]), list) - self.assertEqual(len(res[0]), num_instances) - self.assertEqual(len(res[0][0]), line_length) + assert type(res[0]) == list + assert len(res[0]) == num_instances + assert len(res[0][0]) == line_length # usercpu_time_millis_* not recorded when n_jobs > 1 # *_time_millis_* not recorded when n_jobs = -1 - self.assertEqual(len(res[2]["predictive_accuracy"][0]), 10) - self.assertEqual(len(res[3]["predictive_accuracy"][0]), 10) - self.assertEqual(parallel_mock.call_count, call_count) + assert len(res[2]["predictive_accuracy"][0]) == 10 + assert 
len(res[3]["predictive_accuracy"][0]) == 10 + assert parallel_mock.call_count == call_count @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", @@ -1790,17 +1874,17 @@ def test_joblib_backends(self, parallel_mock): def test_delete_run(self): rs = 1 clf = sklearn.pipeline.Pipeline( - steps=[("imputer", SimpleImputer()), ("estimator", DecisionTreeClassifier())] + steps=[("imputer", SimpleImputer()), ("estimator", DecisionTreeClassifier())], ) task = openml.tasks.get_task(32) # diabetes; crossvalidation run = openml.runs.run_model_on_task(model=clf, task=task, seed=rs) run.publish() TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) + TestBase.logger.info(f"collected from test_run_functions: {run.run_id}") _run_id = run.run_id - self.assertTrue(delete_run(_run_id)) + assert delete_run(_run_id) @mock.patch.object(requests.Session, "delete") @@ -1808,7 +1892,8 @@ def test_delete_run_not_owned(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_not_owned.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -1829,7 +1914,8 @@ def test_delete_run_success(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_successful.xml" mock_delete.return_value = create_request_response( - status_code=200, content_filepath=content_file + status_code=200, + content_filepath=content_file, ) success = openml.runs.delete_run(10591880) @@ -1847,7 +1933,8 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = 
test_files_directory / "mock_responses" / "runs" / "run_delete_not_exist.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( diff --git a/tests/test_runs/test_trace.py b/tests/test_runs/test_trace.py index d08c99e88..bdf9de42d 100644 --- a/tests/test_runs/test_trace.py +++ b/tests/test_runs/test_trace.py @@ -1,4 +1,7 @@ # License: BSD 3-Clause +from __future__ import annotations + +import pytest from openml.runs import OpenMLRunTrace, OpenMLTraceIteration from openml.testing import TestBase @@ -23,30 +26,21 @@ def test_get_selected_iteration(self): trace = OpenMLRunTrace(-1, trace_iterations=trace_iterations) # This next one should simply not fail - self.assertEqual(trace.get_selected_iteration(2, 2), 2) - with self.assertRaisesRegex( - ValueError, - "Could not find the selected iteration for rep/fold 3/3", + assert trace.get_selected_iteration(2, 2) == 2 + with pytest.raises( + ValueError, match="Could not find the selected iteration for rep/fold 3/3" ): trace.get_selected_iteration(3, 3) def test_initialization(self): """Check all different ways to fail the initialization""" - with self.assertRaisesRegex( - ValueError, - "Trace content not available.", - ): + with pytest.raises(ValueError, match="Trace content not available."): OpenMLRunTrace.generate(attributes="foo", content=None) - with self.assertRaisesRegex( - ValueError, - "Trace attributes not available.", - ): + with pytest.raises(ValueError, match="Trace attributes not available."): OpenMLRunTrace.generate(attributes=None, content="foo") - with self.assertRaisesRegex(ValueError, "Trace content is empty."): + with pytest.raises(ValueError, match="Trace content is empty."): OpenMLRunTrace.generate(attributes="foo", content=[]) - with self.assertRaisesRegex( - ValueError, "Trace_attributes and trace_content not compatible:" - ): + with pytest.raises(ValueError, 
match="Trace_attributes and trace_content not compatible:"): OpenMLRunTrace.generate(attributes=["abc"], content=[[1, 2]]) def test_duplicate_name(self): @@ -61,8 +55,9 @@ def test_duplicate_name(self): ("repeat", "NUMERICAL"), ] trace_content = [[0, 0, 0, 0.5, "true", 1], [0, 0, 0, 0.9, "false", 2]] - with self.assertRaisesRegex( - ValueError, "Either `setup_string` or `parameters` needs to be passed as argument." + with pytest.raises( + ValueError, + match="Either `setup_string` or `parameters` needs to be passed as argument.", ): OpenMLRunTrace.generate(trace_attributes, trace_content) @@ -75,8 +70,9 @@ def test_duplicate_name(self): ("sunshine", "NUMERICAL"), ] trace_content = [[0, 0, 0, 0.5, "true", 1], [0, 0, 0, 0.9, "false", 2]] - with self.assertRaisesRegex( + with pytest.raises( ValueError, - "Encountered unknown attribute sunshine that does not start with " "prefix parameter_", + match="Encountered unknown attribute sunshine that does not start with " + "prefix parameter_", ): OpenMLRunTrace.generate(trace_attributes, trace_content) diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index ef1acc405..9e357f6aa 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -1,20 +1,21 @@ # License: BSD 3-Clause +from __future__ import annotations import hashlib import time import unittest.mock +from typing import Dict + +import pandas as pd +import pytest +import sklearn.base +import sklearn.naive_bayes +import sklearn.tree import openml import openml.exceptions import openml.extensions.sklearn from openml.testing import TestBase -from typing import Dict -import pandas as pd -import pytest - -import sklearn.tree -import sklearn.naive_bayes -import sklearn.base def get_sentinel(): @@ -24,8 +25,7 @@ def get_sentinel(): md5 = hashlib.md5() md5.update(str(time.time()).encode("utf-8")) sentinel = md5.hexdigest()[:10] - sentinel = "TEST%s" % sentinel - return sentinel + 
return "TEST%s" % sentinel class TestSetupFunctions(TestBase): @@ -35,37 +35,37 @@ def setUp(self): self.extension = openml.extensions.sklearn.SklearnExtension() super().setUp() - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_nonexisting_setup_exists(self): # first publish a non-existing flow sentinel = get_sentinel() # because of the sentinel, we can not use flows that contain subflows dectree = sklearn.tree.DecisionTreeClassifier() flow = self.extension.model_to_flow(dectree) - flow.name = "TEST%s%s" % (sentinel, flow.name) + flow.name = f"TEST{sentinel}{flow.name}" flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) # although the flow exists (created as of previous statement), # we can be sure there are no setups (yet) as it was just created # and hasn't been ran setup_id = openml.setups.setup_exists(flow) - self.assertFalse(setup_id) + assert not setup_id def _existing_setup_exists(self, classif): flow = self.extension.model_to_flow(classif) - flow.name = "TEST%s%s" % (get_sentinel(), flow.name) + flow.name = f"TEST{get_sentinel()}{flow.name}" flow.publish() - TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)) # although the flow exists, we can be sure there are no # setups (yet) as it hasn't been ran setup_id = openml.setups.setup_exists(flow) - self.assertFalse(setup_id) + assert not setup_id setup_id = openml.setups.setup_exists(flow) - self.assertFalse(setup_id) + assert not setup_id # now run the flow on an easy task: task = openml.tasks.get_task(115) # diabetes; crossvalidation @@ -80,9 +80,9 @@ def _existing_setup_exists(self, classif): # execute the function we are interested 
in setup_id = openml.setups.setup_exists(flow) - self.assertEqual(setup_id, run.setup_id) + assert setup_id == run.setup_id - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_existing_setup_exists_1(self): def side_effect(self): self.var_smoothing = 1e-9 @@ -97,12 +97,12 @@ def side_effect(self): nb = sklearn.naive_bayes.GaussianNB() self._existing_setup_exists(nb) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_exisiting_setup_exists_2(self): # Check a flow with one hyperparameter self._existing_setup_exists(sklearn.naive_bayes.GaussianNB()) - @pytest.mark.sklearn + @pytest.mark.sklearn() def test_existing_setup_exists_3(self): # Check a flow with many hyperparameters self._existing_setup_exists( @@ -112,7 +112,7 @@ def test_existing_setup_exists_3(self): # Not setting the random state will make this flow fail as running it # will add a random random_state. random_state=1, - ) + ), ) def test_get_setup(self): @@ -128,10 +128,11 @@ def test_get_setup(self): current = openml.setups.get_setup(setups[idx]) assert current.flow_id > 0 if num_params[idx] == 0: - self.assertIsNone(current.parameters) + assert current.parameters is None else: - self.assertEqual(len(current.parameters), num_params[idx]) + assert len(current.parameters) == num_params[idx] + @pytest.mark.production() def test_setup_list_filter_flow(self): openml.config.server = self.production_server @@ -139,49 +140,47 @@ def test_setup_list_filter_flow(self): setups = openml.setups.list_setups(flow=flow_id) - self.assertGreater(len(setups), 0) # TODO: please adjust 0 - for setup_id in setups.keys(): - self.assertEqual(setups[setup_id].flow_id, flow_id) + assert len(setups) > 0 # TODO: please adjust 0 + for setup_id in setups: + assert setups[setup_id].flow_id == flow_id def test_list_setups_empty(self): setups = openml.setups.list_setups(setup=[0]) if len(setups) > 0: raise ValueError("UnitTest Outdated, got somehow results") - self.assertIsInstance(setups, dict) + assert isinstance(setups, 
dict) + @pytest.mark.production() def test_list_setups_output_format(self): openml.config.server = self.production_server flow_id = 6794 setups = openml.setups.list_setups(flow=flow_id, output_format="object", size=10) - self.assertIsInstance(setups, Dict) - self.assertIsInstance(setups[list(setups.keys())[0]], openml.setups.setup.OpenMLSetup) - self.assertEqual(len(setups), 10) + assert isinstance(setups, Dict) + assert isinstance(setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup) + assert len(setups) == 10 setups = openml.setups.list_setups(flow=flow_id, output_format="dataframe", size=10) - self.assertIsInstance(setups, pd.DataFrame) - self.assertEqual(len(setups), 10) + assert isinstance(setups, pd.DataFrame) + assert len(setups) == 10 # TODO: [0.15] Remove section as `dict` is no longer supported. with pytest.warns(FutureWarning): setups = openml.setups.list_setups(flow=flow_id, output_format="dict", size=10) - self.assertIsInstance(setups, Dict) - self.assertIsInstance(setups[list(setups.keys())[0]], Dict) - self.assertEqual(len(setups), 10) + assert isinstance(setups, Dict) + assert isinstance(setups[next(iter(setups.keys()))], Dict) + assert len(setups) == 10 def test_setuplist_offset(self): - # TODO: remove after pull on live for better testing - # openml.config.server = self.production_server - size = 10 setups = openml.setups.list_setups(offset=0, size=size) - self.assertEqual(len(setups), size) + assert len(setups) == size setups2 = openml.setups.list_setups(offset=size, size=size) - self.assertEqual(len(setups2), size) + assert len(setups2) == size all = set(setups.keys()).union(setups2.keys()) - self.assertEqual(len(all), size * 2) + assert len(all) == size * 2 def test_get_cached_setup(self): openml.config.set_root_cache_directory(self.static_cache_dir) @@ -189,5 +188,5 @@ def test_get_cached_setup(self): def test_get_uncached_setup(self): openml.config.set_root_cache_directory(self.static_cache_dir) - with 
self.assertRaises(openml.exceptions.OpenMLCacheException): + with pytest.raises(openml.exceptions.OpenMLCacheException): openml.setups.functions._get_cached_setup(10) diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index cc3294085..b3f418756 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,19 +1,21 @@ # License: BSD 3-Clause +from __future__ import annotations -from openml.testing import TestBase -from openml.extensions.sklearn import cat, cont +import unittest +from distutils.version import LooseVersion import pytest import sklearn -import unittest -from distutils.version import LooseVersion + +from openml.extensions.sklearn import cat, cont +from openml.testing import TestBase class TestStudyFunctions(TestBase): _multiprocess_can_split_ = True """Test the example code of Bischl et al. (2018)""" - @pytest.mark.sklearn + @pytest.mark.sklearn() @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.24", reason="columntransformer introduction in 0.24.0", @@ -38,35 +40,38 @@ def test_Figure1a(self): run.publish() # publish the experiment on OpenML (optional) print('URL for run: %s/run/%d' %(openml.config.server,run.run_id)) """ # noqa: E501 - import openml import sklearn.metrics import sklearn.tree + from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline, make_pipeline - from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder, StandardScaler + import openml + benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite cat_imp = OneHotEncoder(handle_unknown="ignore") cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) clf = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + 
steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], ) # build a sklearn classifier for task_id in benchmark_suite.tasks[:1]: # iterate over all tasks task = openml.tasks.get_task(task_id) # download the OpenML task X, y = task.get_X_and_y() # get the data (not used in this example) openml.config.apikey = openml.config.apikey # set the OpenML Api Key run = openml.runs.run_model_on_task( - clf, task, avoid_duplicate_runs=False + clf, + task, + avoid_duplicate_runs=False, ) # run classifier on splits (requires API key) score = run.get_metric_fn(sklearn.metrics.accuracy_score) # print accuracy score TestBase.logger.info( - "Data set: %s; Accuracy: %0.2f" % (task.get_dataset().name, score.mean()) + f"Data set: {task.get_dataset().name}; Accuracy: {score.mean():0.2f}", ) run.publish() # publish the experiment on OpenML (optional) TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], run.run_id) + "collected from {}: {}".format(__file__.split("/")[-1], run.run_id), ) TestBase.logger.info("URL for run: %s/run/%d" % (openml.config.server, run.run_id)) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index bfbbbee49..721c81f9e 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -1,70 +1,75 @@ # License: BSD 3-Clause -from typing import Optional, List +from __future__ import annotations + +import pandas as pd +import pytest import openml import openml.study from openml.testing import TestBase -import pandas as pd -import pytest class TestStudyFunctions(TestBase): _multiprocess_can_split_ = True + @pytest.mark.production() def test_get_study_old(self): openml.config.server = self.production_server study = openml.study.get_study(34) - self.assertEqual(len(study.data), 105) - self.assertEqual(len(study.tasks), 105) - self.assertEqual(len(study.flows), 27) - 
self.assertEqual(len(study.setups), 30) - self.assertIsNone(study.runs) + assert len(study.data) == 105 + assert len(study.tasks) == 105 + assert len(study.flows) == 27 + assert len(study.setups) == 30 + assert study.runs is None + @pytest.mark.production() def test_get_study_new(self): openml.config.server = self.production_server study = openml.study.get_study(123) - self.assertEqual(len(study.data), 299) - self.assertEqual(len(study.tasks), 299) - self.assertEqual(len(study.flows), 5) - self.assertEqual(len(study.setups), 1253) - self.assertEqual(len(study.runs), 1693) + assert len(study.data) == 299 + assert len(study.tasks) == 299 + assert len(study.flows) == 5 + assert len(study.setups) == 1253 + assert len(study.runs) == 1693 + @pytest.mark.production() def test_get_openml100(self): openml.config.server = self.production_server study = openml.study.get_study("OpenML100", "tasks") - self.assertIsInstance(study, openml.study.OpenMLBenchmarkSuite) + assert isinstance(study, openml.study.OpenMLBenchmarkSuite) study_2 = openml.study.get_suite("OpenML100") - self.assertIsInstance(study_2, openml.study.OpenMLBenchmarkSuite) - self.assertEqual(study.study_id, study_2.study_id) + assert isinstance(study_2, openml.study.OpenMLBenchmarkSuite) + assert study.study_id == study_2.study_id + @pytest.mark.production() def test_get_study_error(self): openml.config.server = self.production_server - with self.assertRaisesRegex( - ValueError, - "Unexpected entity type 'task' reported by the server, expected 'run'", + with pytest.raises( + ValueError, match="Unexpected entity type 'task' reported by the server, expected 'run'" ): openml.study.get_study(99) + @pytest.mark.production() def test_get_suite(self): openml.config.server = self.production_server study = openml.study.get_suite(99) - self.assertEqual(len(study.data), 72) - self.assertEqual(len(study.tasks), 72) - self.assertIsNone(study.flows) - self.assertIsNone(study.runs) - self.assertIsNone(study.setups) + assert 
len(study.data) == 72 + assert len(study.tasks) == 72 + assert study.flows is None + assert study.runs is None + assert study.setups is None + @pytest.mark.production() def test_get_suite_error(self): openml.config.server = self.production_server - with self.assertRaisesRegex( - ValueError, - "Unexpected entity type 'run' reported by the server, expected 'task'", + with pytest.raises( + ValueError, match="Unexpected entity type 'run' reported by the server, expected 'task'" ): openml.study.get_suite(123) @@ -84,20 +89,20 @@ def test_publish_benchmark_suite(self): TestBase._mark_entity_for_removal("study", study.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id)) - self.assertGreater(study.id, 0) + assert study.id > 0 # verify main meta data study_downloaded = openml.study.get_suite(study.id) - self.assertEqual(study_downloaded.alias, fixture_alias) - self.assertEqual(study_downloaded.name, fixture_name) - self.assertEqual(study_downloaded.description, fixture_descr) - self.assertEqual(study_downloaded.main_entity_type, "task") + assert study_downloaded.alias == fixture_alias + assert study_downloaded.name == fixture_name + assert study_downloaded.description == fixture_descr + assert study_downloaded.main_entity_type == "task" # verify resources - self.assertIsNone(study_downloaded.flows) - self.assertIsNone(study_downloaded.setups) - self.assertIsNone(study_downloaded.runs) - self.assertGreater(len(study_downloaded.data), 0) - self.assertLessEqual(len(study_downloaded.data), len(fixture_task_ids)) + assert study_downloaded.flows is None + assert study_downloaded.setups is None + assert study_downloaded.runs is None + assert len(study_downloaded.data) > 0 + assert len(study_downloaded.data) <= len(fixture_task_ids) self.assertSetEqual(set(study_downloaded.tasks), set(fixture_task_ids)) # attach more tasks @@ -114,11 +119,11 @@ def test_publish_benchmark_suite(self): # test status update function 
openml.study.update_suite_status(study.id, "deactivated") study_downloaded = openml.study.get_suite(study.id) - self.assertEqual(study_downloaded.status, "deactivated") + assert study_downloaded.status == "deactivated" # can't delete study, now it's not longer in preparation def _test_publish_empty_study_is_allowed(self, explicit: bool): - runs: Optional[List[int]] = [] if explicit else None + runs: list[int] | None = [] if explicit else None kind = "explicit" if explicit else "implicit" study = openml.study.create_study( @@ -131,10 +136,10 @@ def _test_publish_empty_study_is_allowed(self, explicit: bool): TestBase._mark_entity_for_removal("study", study.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id)) - self.assertGreater(study.id, 0) + assert study.id > 0 study_downloaded = openml.study.get_study(study.id) - self.assertEqual(study_downloaded.main_entity_type, "run") - self.assertIsNone(study_downloaded.runs) + assert study_downloaded.main_entity_type == "run" + assert study_downloaded.runs is None def test_publish_empty_study_explicit(self): self._test_publish_empty_study_is_allowed(explicit=True) @@ -146,14 +151,14 @@ def test_publish_empty_study_implicit(self): def test_publish_study(self): # get some random runs to attach run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10) - self.assertEqual(len(run_list), 10) + assert len(run_list) == 10 fixt_alias = None fixt_name = "unit tested study" fixt_descr = "bla" - fixt_flow_ids = set([evaluation.flow_id for evaluation in run_list.values()]) - fixt_task_ids = set([evaluation.task_id for evaluation in run_list.values()]) - fixt_setup_ids = set([evaluation.setup_id for evaluation in run_list.values()]) + fixt_flow_ids = {evaluation.flow_id for evaluation in run_list.values()} + fixt_task_ids = {evaluation.task_id for evaluation in run_list.values()} + fixt_setup_ids = {evaluation.setup_id for evaluation in run_list.values()} study = 
openml.study.create_study( alias=fixt_alias, @@ -165,12 +170,12 @@ def test_publish_study(self): study.publish() TestBase._mark_entity_for_removal("study", study.id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id)) - self.assertGreater(study.id, 0) + assert study.id > 0 study_downloaded = openml.study.get_study(study.id) - self.assertEqual(study_downloaded.alias, fixt_alias) - self.assertEqual(study_downloaded.name, fixt_name) - self.assertEqual(study_downloaded.description, fixt_descr) - self.assertEqual(study_downloaded.main_entity_type, "run") + assert study_downloaded.alias == fixt_alias + assert study_downloaded.name == fixt_name + assert study_downloaded.description == fixt_descr + assert study_downloaded.main_entity_type == "run" self.assertSetEqual(set(study_downloaded.runs), set(run_list.keys())) self.assertSetEqual(set(study_downloaded.setups), set(fixt_setup_ids)) @@ -183,7 +188,9 @@ def test_publish_study(self): # test whether the list evaluation function also handles study data fine run_ids = openml.evaluations.list_evaluations( - "predictive_accuracy", size=None, study=study.id + "predictive_accuracy", + size=None, + study=study.id, ) self.assertSetEqual(set(run_ids), set(study_downloaded.runs)) @@ -204,16 +211,16 @@ def test_publish_study(self): # test status update function openml.study.update_study_status(study.id, "deactivated") study_downloaded = openml.study.get_study(study.id) - self.assertEqual(study_downloaded.status, "deactivated") + assert study_downloaded.status == "deactivated" res = openml.study.delete_study(study.id) - self.assertTrue(res) + assert res def test_study_attach_illegal(self): run_list = openml.runs.list_runs(size=10) - self.assertEqual(len(run_list), 10) + assert len(run_list) == 10 run_list_more = openml.runs.list_runs(size=20) - self.assertEqual(len(run_list_more), 20) + assert len(run_list_more) == 20 study = openml.study.create_study( alias=None, @@ -227,14 +234,14 @@ def 
test_study_attach_illegal(self): TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id)) study_original = openml.study.get_study(study.id) - with self.assertRaisesRegex( - openml.exceptions.OpenMLServerException, "Problem attaching entities." + with pytest.raises( + openml.exceptions.OpenMLServerException, match="Problem attaching entities." ): # run id does not exists openml.study.attach_to_study(study.id, [0]) - with self.assertRaisesRegex( - openml.exceptions.OpenMLServerException, "Problem attaching entities." + with pytest.raises( + openml.exceptions.OpenMLServerException, match="Problem attaching entities." ): # some runs already attached openml.study.attach_to_study(study.id, list(run_list_more.keys())) @@ -244,8 +251,8 @@ def test_study_attach_illegal(self): def test_study_list(self): study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe") # might fail if server is recently reset - self.assertGreaterEqual(len(study_list), 2) + assert len(study_list) >= 2 def test_study_list_output_format(self): study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe") - self.assertIsInstance(study_list, pd.DataFrame) + assert isinstance(study_list, pd.DataFrame) diff --git a/tests/test_tasks/__init__.py b/tests/test_tasks/__init__.py index e987ab735..26488a8cc 100644 --- a/tests/test_tasks/__init__.py +++ b/tests/test_tasks/__init__.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause -from .test_task import OpenMLTaskTest from .test_supervised_task import OpenMLSupervisedTaskTest +from .test_task import OpenMLTaskTest __all__ = [ "OpenMLTaskTest", diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index 4f03c77fc..661e8eced 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -1,8 +1,10 @@ # License: BSD 3-Clause +from __future__ import annotations import numpy as np from 
openml.tasks import TaskType, get_task + from .test_supervised_task import OpenMLSupervisedTaskTest @@ -10,25 +12,25 @@ class OpenMLClassificationTaskTest(OpenMLSupervisedTaskTest): __test__ = True def setUp(self, n_levels: int = 1): - super(OpenMLClassificationTaskTest, self).setUp() + super().setUp() self.task_id = 119 # diabetes self.task_type = TaskType.SUPERVISED_CLASSIFICATION self.estimation_procedure = 1 def test_get_X_and_Y(self): - X, Y = super(OpenMLClassificationTaskTest, self).test_get_X_and_Y() - self.assertEqual((768, 8), X.shape) - self.assertIsInstance(X, np.ndarray) - self.assertEqual((768,), Y.shape) - self.assertIsInstance(Y, np.ndarray) - self.assertEqual(Y.dtype, int) + X, Y = super().test_get_X_and_Y() + assert X.shape == (768, 8) + assert isinstance(X, np.ndarray) + assert Y.shape == (768,) + assert isinstance(Y, np.ndarray) + assert Y.dtype == int def test_download_task(self): - task = super(OpenMLClassificationTaskTest, self).test_download_task() - self.assertEqual(task.task_id, self.task_id) - self.assertEqual(task.task_type_id, TaskType.SUPERVISED_CLASSIFICATION) - self.assertEqual(task.dataset_id, 20) + task = super().test_download_task() + assert task.task_id == self.task_id + assert task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION + assert task.dataset_id == 20 def test_class_labels(self): task = get_task(self.task_id) - self.assertEqual(task.class_labels, ["tested_negative", "tested_positive"]) + assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py index d7a414276..bc59ad26c 100644 --- a/tests/test_tasks/test_clustering_task.py +++ b/tests/test_tasks/test_clustering_task.py @@ -1,34 +1,40 @@ # License: BSD 3-Clause +from __future__ import annotations + +import pytest import openml +from openml.exceptions import OpenMLServerException from openml.tasks import TaskType from openml.testing import TestBase + from 
.test_task import OpenMLTaskTest -from openml.exceptions import OpenMLServerException class OpenMLClusteringTaskTest(OpenMLTaskTest): __test__ = True def setUp(self, n_levels: int = 1): - super(OpenMLClusteringTaskTest, self).setUp() + super().setUp() self.task_id = 146714 self.task_type = TaskType.CLUSTERING self.estimation_procedure = 17 + @pytest.mark.production() def test_get_dataset(self): # no clustering tasks on test server openml.config.server = self.production_server task = openml.tasks.get_task(self.task_id) task.get_dataset() + @pytest.mark.production() def test_download_task(self): # no clustering tasks on test server openml.config.server = self.production_server - task = super(OpenMLClusteringTaskTest, self).test_download_task() - self.assertEqual(task.task_id, self.task_id) - self.assertEqual(task.task_type_id, TaskType.CLUSTERING) - self.assertEqual(task.dataset_id, 36) + task = super().test_download_task() + assert task.task_id == self.task_id + assert task.task_type_id == TaskType.CLUSTERING + assert task.dataset_id == 36 def test_upload_task(self): compatible_datasets = self._get_compatible_rand_dataset() @@ -44,7 +50,7 @@ def test_upload_task(self): task = task.publish() TestBase._mark_entity_for_removal("task", task.id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], task.id) + "collected from {}: {}".format(__file__.split("/")[-1], task.id), ) # success break @@ -58,5 +64,5 @@ def test_upload_task(self): raise e else: raise ValueError( - "Could not create a valid task for task type ID {}".format(self.task_type) + f"Could not create a valid task for task type ID {self.task_type}", ) diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index b3543f9ca..0e781c8ff 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -1,8 +1,10 @@ # License: BSD 3-Clause +from __future__ import annotations import numpy as np 
from openml.tasks import TaskType, get_task + from .test_supervised_task import OpenMLSupervisedTaskTest @@ -10,25 +12,25 @@ class OpenMLLearningCurveTaskTest(OpenMLSupervisedTaskTest): __test__ = True def setUp(self, n_levels: int = 1): - super(OpenMLLearningCurveTaskTest, self).setUp() + super().setUp() self.task_id = 801 # diabetes self.task_type = TaskType.LEARNING_CURVE self.estimation_procedure = 13 def test_get_X_and_Y(self): - X, Y = super(OpenMLLearningCurveTaskTest, self).test_get_X_and_Y() - self.assertEqual((768, 8), X.shape) - self.assertIsInstance(X, np.ndarray) - self.assertEqual((768,), Y.shape) - self.assertIsInstance(Y, np.ndarray) - self.assertEqual(Y.dtype, int) + X, Y = super().test_get_X_and_Y() + assert X.shape == (768, 8) + assert isinstance(X, np.ndarray) + assert Y.shape == (768,) + assert isinstance(Y, np.ndarray) + assert Y.dtype == int def test_download_task(self): - task = super(OpenMLLearningCurveTaskTest, self).test_download_task() - self.assertEqual(task.task_id, self.task_id) - self.assertEqual(task.task_type_id, TaskType.LEARNING_CURVE) - self.assertEqual(task.dataset_id, 20) + task = super().test_download_task() + assert task.task_id == self.task_id + assert task.task_type_id == TaskType.LEARNING_CURVE + assert task.dataset_id == 20 def test_class_labels(self): task = get_task(self.task_id) - self.assertEqual(task.class_labels, ["tested_negative", "tested_positive"]) + assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index c958bb3dd..29a8254df 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -1,13 +1,15 @@ # License: BSD 3-Clause +from __future__ import annotations import ast + import numpy as np import openml -from openml.tasks import TaskType -from openml.testing import TestBase -from openml.testing import check_task_existence from openml.exceptions import 
OpenMLServerException +from openml.tasks import TaskType +from openml.testing import TestBase, check_task_existence + from .test_supervised_task import OpenMLSupervisedTaskTest @@ -15,7 +17,7 @@ class OpenMLRegressionTaskTest(OpenMLSupervisedTaskTest): __test__ = True def setUp(self, n_levels: int = 1): - super(OpenMLRegressionTaskTest, self).setUp() + super().setUp() task_meta_data = { "task_type": TaskType.SUPERVISED_REGRESSION, @@ -34,7 +36,7 @@ def setUp(self, n_levels: int = 1): task_id = new_task.task_id # mark to remove the uploaded task TestBase._mark_entity_for_removal("task", task_id) - TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + TestBase.logger.info(f"collected from test_run_functions: {task_id}") except OpenMLServerException as e: if e.code == 614: # Task already exists # the exception message contains the task_id that was matched in the format @@ -47,15 +49,15 @@ def setUp(self, n_levels: int = 1): self.estimation_procedure = 7 def test_get_X_and_Y(self): - X, Y = super(OpenMLRegressionTaskTest, self).test_get_X_and_Y() - self.assertEqual((194, 32), X.shape) - self.assertIsInstance(X, np.ndarray) - self.assertEqual((194,), Y.shape) - self.assertIsInstance(Y, np.ndarray) - self.assertEqual(Y.dtype, float) + X, Y = super().test_get_X_and_Y() + assert X.shape == (194, 32) + assert isinstance(X, np.ndarray) + assert Y.shape == (194,) + assert isinstance(Y, np.ndarray) + assert Y.dtype == float def test_download_task(self): - task = super(OpenMLRegressionTaskTest, self).test_download_task() - self.assertEqual(task.task_id, self.task_id) - self.assertEqual(task.task_type_id, TaskType.SUPERVISED_REGRESSION) - self.assertEqual(task.dataset_id, 105) + task = super().test_download_task() + assert task.task_id == self.task_id + assert task.task_type_id == TaskType.SUPERVISED_REGRESSION + assert task.dataset_id == 105 diff --git a/tests/test_tasks/test_split.py b/tests/test_tasks/test_split.py index 7d8004a91..12cb632d9 100644 
--- a/tests/test_tasks/test_split.py +++ b/tests/test_tasks/test_split.py @@ -1,7 +1,9 @@ # License: BSD 3-Clause +from __future__ import annotations import inspect import os +from pathlib import Path import numpy as np @@ -17,18 +19,17 @@ def setUp(self): __file__ = inspect.getfile(OpenMLSplitTest) self.directory = os.path.dirname(__file__) # This is for dataset - self.arff_filename = os.path.join( - self.directory, - "..", - "files", - "org", - "openml", - "test", - "tasks", - "1882", - "datasplits.arff", + self.arff_filepath = ( + Path(self.directory).parent + / "files" + / "org" + / "openml" + / "test" + / "tasks" + / "1882" + / "datasplits.arff" ) - self.pd_filename = self.arff_filename.replace(".arff", ".pkl.py3") + self.pd_filename = self.arff_filepath.with_suffix(".pkl.py3") def tearDown(self): try: @@ -38,49 +39,49 @@ def tearDown(self): pass def test_eq(self): - split = OpenMLSplit._from_arff_file(self.arff_filename) - self.assertEqual(split, split) + split = OpenMLSplit._from_arff_file(self.arff_filepath) + assert split == split - split2 = OpenMLSplit._from_arff_file(self.arff_filename) + split2 = OpenMLSplit._from_arff_file(self.arff_filepath) split2.name = "a" - self.assertNotEqual(split, split2) + assert split != split2 - split2 = OpenMLSplit._from_arff_file(self.arff_filename) + split2 = OpenMLSplit._from_arff_file(self.arff_filepath) split2.description = "a" - self.assertNotEqual(split, split2) + assert split != split2 - split2 = OpenMLSplit._from_arff_file(self.arff_filename) - split2.split[10] = dict() - self.assertNotEqual(split, split2) + split2 = OpenMLSplit._from_arff_file(self.arff_filepath) + split2.split[10] = {} + assert split != split2 - split2 = OpenMLSplit._from_arff_file(self.arff_filename) - split2.split[0][10] = dict() - self.assertNotEqual(split, split2) + split2 = OpenMLSplit._from_arff_file(self.arff_filepath) + split2.split[0][10] = {} + assert split != split2 def test_from_arff_file(self): - split = 
OpenMLSplit._from_arff_file(self.arff_filename) - self.assertIsInstance(split.split, dict) - self.assertIsInstance(split.split[0], dict) - self.assertIsInstance(split.split[0][0], dict) - self.assertIsInstance(split.split[0][0][0][0], np.ndarray) - self.assertIsInstance(split.split[0][0][0].train, np.ndarray) - self.assertIsInstance(split.split[0][0][0].train, np.ndarray) - self.assertIsInstance(split.split[0][0][0][1], np.ndarray) - self.assertIsInstance(split.split[0][0][0].test, np.ndarray) - self.assertIsInstance(split.split[0][0][0].test, np.ndarray) + split = OpenMLSplit._from_arff_file(self.arff_filepath) + assert isinstance(split.split, dict) + assert isinstance(split.split[0], dict) + assert isinstance(split.split[0][0], dict) + assert isinstance(split.split[0][0][0][0], np.ndarray) + assert isinstance(split.split[0][0][0].train, np.ndarray) + assert isinstance(split.split[0][0][0].train, np.ndarray) + assert isinstance(split.split[0][0][0][1], np.ndarray) + assert isinstance(split.split[0][0][0].test, np.ndarray) + assert isinstance(split.split[0][0][0].test, np.ndarray) for i in range(10): for j in range(10): - self.assertGreaterEqual(split.split[i][j][0].train.shape[0], 808) - self.assertGreaterEqual(split.split[i][j][0].test.shape[0], 89) - self.assertEqual( - split.split[i][j][0].train.shape[0] + split.split[i][j][0].test.shape[0], 898 + assert split.split[i][j][0].train.shape[0] >= 808 + assert split.split[i][j][0].test.shape[0] >= 89 + assert ( + split.split[i][j][0].train.shape[0] + split.split[i][j][0].test.shape[0] == 898 ) def test_get_split(self): - split = OpenMLSplit._from_arff_file(self.arff_filename) + split = OpenMLSplit._from_arff_file(self.arff_filepath) train_split, test_split = split.get(fold=5, repeat=2) - self.assertEqual(train_split.shape[0], 808) - self.assertEqual(test_split.shape[0], 90) + assert train_split.shape[0] == 808 + assert test_split.shape[0] == 90 self.assertRaisesRegex( ValueError, "Repeat 10 not known", diff --git 
a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py index 69b6a3c1d..00ce1f276 100644 --- a/tests/test_tasks/test_supervised_task.py +++ b/tests/test_tasks/test_supervised_task.py @@ -1,11 +1,12 @@ # License: BSD 3-Clause +from __future__ import annotations -from typing import Tuple import unittest import numpy as np from openml.tasks import get_task + from .test_task import OpenMLTaskTest @@ -21,12 +22,12 @@ class OpenMLSupervisedTaskTest(OpenMLTaskTest): def setUpClass(cls): if cls is OpenMLSupervisedTaskTest: raise unittest.SkipTest("Skip OpenMLSupervisedTaskTest tests," " it's a base class") - super(OpenMLSupervisedTaskTest, cls).setUpClass() + super().setUpClass() def setUp(self, n_levels: int = 1): - super(OpenMLSupervisedTaskTest, self).setUp() + super().setUp() - def test_get_X_and_Y(self) -> Tuple[np.ndarray, np.ndarray]: + def test_get_X_and_Y(self) -> tuple[np.ndarray, np.ndarray]: task = get_task(self.task_id) X, Y = task.get_X_and_y() return X, Y diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index cd8e515c1..ec5a8caf5 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -1,16 +1,16 @@ # License: BSD 3-Clause +from __future__ import annotations import unittest -from typing import List from random import randint, shuffle -from openml.exceptions import OpenMLServerException -from openml.testing import TestBase from openml.datasets import ( get_dataset, list_datasets, ) +from openml.exceptions import OpenMLServerException from openml.tasks import TaskType, create_task, get_task +from openml.testing import TestBase class OpenMLTaskTest(TestBase): @@ -25,10 +25,10 @@ class OpenMLTaskTest(TestBase): def setUpClass(cls): if cls is OpenMLTaskTest: raise unittest.SkipTest("Skip OpenMLTaskTest tests," " it's a base class") - super(OpenMLTaskTest, cls).setUpClass() + super().setUpClass() def setUp(self, n_levels: int = 1): - super(OpenMLTaskTest, self).setUp() + 
super().setUp() def test_download_task(self): return get_task(self.task_id) @@ -53,7 +53,7 @@ def test_upload_task(self): task.publish() TestBase._mark_entity_for_removal("task", task.id) TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], task.id) + "collected from {}: {}".format(__file__.split("/")[-1], task.id), ) # success break @@ -67,10 +67,10 @@ def test_upload_task(self): raise e else: raise ValueError( - "Could not create a valid task for task type ID {}".format(self.task_type) + f"Could not create a valid task for task type ID {self.task_type}", ) - def _get_compatible_rand_dataset(self) -> List: + def _get_compatible_rand_dataset(self) -> list: active_datasets = list_datasets(status="active", output_format="dataframe") # depending on the task type, find either datasets diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 481ef2d83..3dc776a2b 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -1,41 +1,42 @@ # License: BSD 3-Clause +from __future__ import annotations import os +import unittest from typing import cast from unittest import mock +import pandas as pd import pytest import requests -from openml.tasks import TaskType -from openml.testing import TestBase, create_request_response +import openml from openml import OpenMLSplit, OpenMLTask from openml.exceptions import OpenMLCacheException, OpenMLNotAuthorizedError, OpenMLServerException -import openml -import unittest -import pandas as pd +from openml.tasks import TaskType +from openml.testing import TestBase, create_request_response class TestTask(TestBase): _multiprocess_can_split_ = True def setUp(self): - super(TestTask, self).setUp() + super().setUp() def tearDown(self): - super(TestTask, self).tearDown() + super().tearDown() def test__get_cached_tasks(self): openml.config.set_root_cache_directory(self.static_cache_dir) tasks = openml.tasks.functions._get_cached_tasks() 
- self.assertIsInstance(tasks, dict) - self.assertEqual(len(tasks), 3) - self.assertIsInstance(list(tasks.values())[0], OpenMLTask) + assert isinstance(tasks, dict) + assert len(tasks) == 3 + assert isinstance(next(iter(tasks.values())), OpenMLTask) def test__get_cached_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.functions._get_cached_task(1) - self.assertIsInstance(task, OpenMLTask) + assert isinstance(task, OpenMLTask) def test__get_cached_task_not_cached(self): openml.config.set_root_cache_directory(self.static_cache_dir) @@ -48,12 +49,11 @@ def test__get_cached_task_not_cached(self): def test__get_estimation_procedure_list(self): estimation_procedures = openml.tasks.functions._get_estimation_procedure_list() - self.assertIsInstance(estimation_procedures, list) - self.assertIsInstance(estimation_procedures[0], dict) - self.assertEqual( - estimation_procedures[0]["task_type_id"], TaskType.SUPERVISED_CLASSIFICATION - ) + assert isinstance(estimation_procedures, list) + assert isinstance(estimation_procedures[0], dict) + assert estimation_procedures[0]["task_type_id"] == TaskType.SUPERVISED_CLASSIFICATION + @pytest.mark.production() def test_list_clustering_task(self): # as shown by #383, clustering tasks can give list/dict casting problems openml.config.server = self.production_server @@ -61,28 +61,28 @@ def test_list_clustering_task(self): # the expected outcome is that it doesn't crash. No assertions. 
def _check_task(self, task): - self.assertEqual(type(task), dict) - self.assertGreaterEqual(len(task), 2) - self.assertIn("did", task) - self.assertIsInstance(task["did"], int) - self.assertIn("status", task) - self.assertIsInstance(task["status"], str) - self.assertIn(task["status"], ["in_preparation", "active", "deactivated"]) + assert type(task) == dict + assert len(task) >= 2 + assert "did" in task + assert isinstance(task["did"], int) + assert "status" in task + assert isinstance(task["status"], str) + assert task["status"] in ["in_preparation", "active", "deactivated"] def test_list_tasks_by_type(self): num_curves_tasks = 198 # number is flexible, check server if fails ttid = TaskType.LEARNING_CURVE tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe") - self.assertGreaterEqual(len(tasks), num_curves_tasks) + assert len(tasks) >= num_curves_tasks for task in tasks.to_dict(orient="index").values(): - self.assertEqual(ttid, task["ttid"]) + assert ttid == task["ttid"] self._check_task(task) def test_list_tasks_output_format(self): ttid = TaskType.LEARNING_CURVE tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe") - self.assertIsInstance(tasks, pd.DataFrame) - self.assertGreater(len(tasks), 100) + assert isinstance(tasks, pd.DataFrame) + assert len(tasks) > 100 def test_list_tasks_empty(self): tasks = cast( @@ -94,13 +94,13 @@ def test_list_tasks_empty(self): def test_list_tasks_by_tag(self): num_basic_tasks = 100 # number is flexible, check server if fails tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe") - self.assertGreaterEqual(len(tasks), num_basic_tasks) + assert len(tasks) >= num_basic_tasks for task in tasks.to_dict(orient="index").values(): self._check_task(task) def test_list_tasks(self): tasks = openml.tasks.list_tasks(output_format="dataframe") - self.assertGreaterEqual(len(tasks), 900) + assert len(tasks) >= 900 for task in tasks.to_dict(orient="index").values(): 
self._check_task(task) @@ -109,7 +109,7 @@ def test_list_tasks_paginate(self): max = 100 for i in range(0, max, size): tasks = openml.tasks.list_tasks(offset=i, size=size, output_format="dataframe") - self.assertGreaterEqual(size, len(tasks)) + assert size >= len(tasks) for task in tasks.to_dict(orient="index").values(): self._check_task(task) @@ -124,11 +124,14 @@ def test_list_tasks_per_type_paginate(self): for j in task_types: for i in range(0, max, size): tasks = openml.tasks.list_tasks( - task_type=j, offset=i, size=size, output_format="dataframe" + task_type=j, + offset=i, + size=size, + output_format="dataframe", ) - self.assertGreaterEqual(size, len(tasks)) + assert size >= len(tasks) for task in tasks.to_dict(orient="index").values(): - self.assertEqual(j, task["ttid"]) + assert j == task["ttid"] self._check_task(task) def test__get_task(self): @@ -136,8 +139,9 @@ def test__get_task(self): openml.tasks.get_task(1882) @unittest.skip( - "Please await outcome of discussion: https://github.com/openml/OpenML/issues/776" - ) # noqa: E501 + "Please await outcome of discussion: https://github.com/openml/OpenML/issues/776", + ) + @pytest.mark.production() def test__get_task_live(self): # Test the following task as it used to throw an Unicode Error. 
# https://github.com/openml/openml-python/issues/378 @@ -146,66 +150,36 @@ def test__get_task_live(self): def test_get_task(self): task = openml.tasks.get_task(1) # anneal; crossvalidation - self.assertIsInstance(task, OpenMLTask) - self.assertTrue( - os.path.exists( - os.path.join( - self.workdir, - "org", - "openml", - "test", - "tasks", - "1", - "task.xml", - ) - ) + assert isinstance(task, OpenMLTask) + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "task.xml") ) - self.assertTrue( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") - ) + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") ) - self.assertTrue( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") - ) + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") ) def test_get_task_lazy(self): task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation - self.assertIsInstance(task, OpenMLTask) - self.assertTrue( - os.path.exists( - os.path.join( - self.workdir, - "org", - "openml", - "test", - "tasks", - "2", - "task.xml", - ) - ) + assert isinstance(task, OpenMLTask) + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "task.xml") ) - self.assertEqual(task.class_labels, ["1", "2", "3", "4", "5", "U"]) + assert task.class_labels == ["1", "2", "3", "4", "5", "U"] - self.assertFalse( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") - ) + assert not os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") ) # Since the download_data=False is propagated to get_dataset - self.assertFalse( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "datasets", "2", "dataset.arff") 
- ) + assert not os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "datasets", "2", "dataset.arff") ) task.download_split() - self.assertTrue( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") - ) + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") ) @mock.patch("openml.tasks.functions.get_dataset") @@ -224,13 +198,14 @@ def assert_and_raise(*args, **kwargs): except WeirdException: pass # Now the file should no longer exist - self.assertFalse(os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml"))) + assert not os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml")) def test_get_task_with_cache(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1) - self.assertIsInstance(task, OpenMLTask) + assert isinstance(task, OpenMLTask) + @pytest.mark.production() def test_get_task_different_types(self): openml.config.server = self.production_server # Regression task @@ -243,11 +218,9 @@ def test_get_task_different_types(self): def test_download_split(self): task = openml.tasks.get_task(1) # anneal; crossvalidation split = task.download_split() - self.assertEqual(type(split), OpenMLSplit) - self.assertTrue( - os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") - ) + assert type(split) == OpenMLSplit + assert os.path.exists( + os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") ) def test_deletion_of_cache_dir(self): @@ -256,9 +229,9 @@ def test_deletion_of_cache_dir(self): "tasks", 1, ) - self.assertTrue(os.path.exists(tid_cache_dir)) + assert os.path.exists(tid_cache_dir) openml.utils._remove_cache_dir_for_id("tasks", tid_cache_dir) - self.assertFalse(os.path.exists(tid_cache_dir)) + assert not os.path.exists(tid_cache_dir) @mock.patch.object(requests.Session, "delete") @@ -266,7 +239,8 
@@ def test_delete_task_not_owned(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_owned.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -287,7 +261,8 @@ def test_delete_task_with_run(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_has_runs.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( @@ -308,7 +283,8 @@ def test_delete_success(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_successful.xml" mock_delete.return_value = create_request_response( - status_code=200, content_filepath=content_file + status_code=200, + content_filepath=content_file, ) success = openml.tasks.delete_task(361323) @@ -326,7 +302,8 @@ def test_delete_unknown_task(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_exist.xml" mock_delete.return_value = create_request_response( - status_code=412, content_filepath=content_file + status_code=412, + content_filepath=content_file, ) with pytest.raises( diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 4f15ccce2..552fbe949 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -1,4 +1,5 @@ # License: BSD 3-Clause +from __future__ import annotations from time 
import time @@ -9,40 +10,50 @@ # Common methods between tasks class OpenMLTaskMethodsTest(TestBase): def setUp(self): - super(OpenMLTaskMethodsTest, self).setUp() + super().setUp() def tearDown(self): - super(OpenMLTaskMethodsTest, self).tearDown() + super().tearDown() def test_tagging(self): task = openml.tasks.get_task(1) # anneal; crossvalidation - tag = "testing_tag_{}_{}".format(self.id(), time()) + # tags can be at most 64 alphanumeric (+ underscore) chars + unique_indicator = str(time()).replace(".", "") + tag = f"test_tag_OpenMLTaskMethodsTest_{unique_indicator}" tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") - self.assertEqual(len(tasks), 0) + assert len(tasks) == 0 task.push_tag(tag) tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") - self.assertEqual(len(tasks), 1) - self.assertIn(1, tasks["tid"]) + assert len(tasks) == 1 + assert 1 in tasks["tid"] task.remove_tag(tag) tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") - self.assertEqual(len(tasks), 0) + assert len(tasks) == 0 def test_get_train_and_test_split_indices(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1882) train_indices, test_indices = task.get_train_test_split_indices(0, 0) - self.assertEqual(16, train_indices[0]) - self.assertEqual(395, train_indices[-1]) - self.assertEqual(412, test_indices[0]) - self.assertEqual(364, test_indices[-1]) + assert train_indices[0] == 16 + assert train_indices[-1] == 395 + assert test_indices[0] == 412 + assert test_indices[-1] == 364 train_indices, test_indices = task.get_train_test_split_indices(2, 2) - self.assertEqual(237, train_indices[0]) - self.assertEqual(681, train_indices[-1]) - self.assertEqual(583, test_indices[0]) - self.assertEqual(24, test_indices[-1]) + assert train_indices[0] == 237 + assert train_indices[-1] == 681 + assert test_indices[0] == 583 + assert test_indices[-1] == 24 self.assertRaisesRegex( - ValueError, "Fold 10 not known", 
task.get_train_test_split_indices, 10, 0 + ValueError, + "Fold 10 not known", + task.get_train_test_split_indices, + 10, + 0, ) self.assertRaisesRegex( - ValueError, "Repeat 10 not known", task.get_train_test_split_indices, 0, 10 + ValueError, + "Repeat 10 not known", + task.get_train_test_split_indices, + 0, + 10, ) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 93bfdb890..cae947917 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -1,118 +1,205 @@ +from __future__ import annotations + import os -import tempfile import unittest.mock - +import pytest +import shutil import openml -from openml.testing import TestBase +from openml.testing import _check_dataset -class OpenMLTaskTest(TestBase): - _multiprocess_can_split_ = True +@pytest.fixture(autouse=True) +def as_robot(): + policy = openml.config.retry_policy + n_retries = openml.config.connection_n_retries + openml.config.set_retry_policy("robot", n_retries=20) + yield + openml.config.set_retry_policy(policy, n_retries) - def mocked_perform_api_call(call, request_method): - # TODO: JvR: Why is this not a staticmethod? - url = openml.config.server + "/" + call - return openml._api_calls._download_text_file(url) - def test_list_all(self): - openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) - openml.utils._list_all( - listing_call=openml.tasks.functions._list_tasks, output_format="dataframe" - ) +@pytest.fixture(autouse=True) +def with_test_server(): + openml.config.start_using_configuration_for_example() + yield + openml.config.stop_using_configuration_for_example() - def test_list_all_with_multiple_batches(self): - res = openml.utils._list_all( - listing_call=openml.tasks.functions._list_tasks, output_format="dict", batch_size=1050 - ) - # Verify that test server state is still valid for this test to work as intended - # -> If the number of results is less than 1050, the test can not test the - # batching operation. 
By having more than 1050 results we know that batching - # was triggered. 1050 appears to be a number of tasks that is available on a fresh - # test server. - assert len(res) > 1050 - openml.utils._list_all( - listing_call=openml.tasks.functions._list_tasks, - output_format="dataframe", - batch_size=1050, - ) - # Comparing the number of tasks is not possible as other unit tests running in - # parallel might be adding or removing tasks! - # assert len(res) <= len(res2) - - @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=mocked_perform_api_call) - def test_list_all_few_results_available(self, _perform_api_call): - # we want to make sure that the number of api calls is only 1. - # Although we have multiple versions of the iris dataset, there is only - # one with this name/version combination - - datasets = openml.datasets.list_datasets( - size=1000, data_name="iris", data_version=1, output_format="dataframe" - ) - self.assertEqual(len(datasets), 1) - self.assertEqual(_perform_api_call.call_count, 1) - def test_list_all_for_datasets(self): - required_size = 127 # default test server reset value - datasets = openml.datasets.list_datasets( - batch_size=100, size=required_size, output_format="dataframe" +@pytest.fixture(autouse=True) +def with_test_cache(test_files_directory, request): + if not test_files_directory.exists(): + raise ValueError( + f"Cannot find test cache dir, expected it to be {test_files_directory!s}!", ) + _root_cache_directory = openml.config._root_cache_directory + tmp_cache = test_files_directory / request.node.name + openml.config.set_root_cache_directory(tmp_cache) + yield + openml.config.set_root_cache_directory(_root_cache_directory) + if tmp_cache.exists(): + shutil.rmtree(tmp_cache) - self.assertEqual(len(datasets), required_size) - for dataset in datasets.to_dict(orient="index").values(): - self._check_dataset(dataset) - def test_list_all_for_tasks(self): - required_size = 1068 # default test server reset value - tasks 
= openml.tasks.list_tasks( - batch_size=1000, size=required_size, output_format="dataframe" - ) - self.assertEqual(len(tasks), required_size) +@pytest.fixture() +def min_number_tasks_on_test_server() -> int: + """After a reset at least 1068 tasks are on the test server""" + return 1068 - def test_list_all_for_flows(self): - required_size = 15 # default test server reset value - flows = openml.flows.list_flows( - batch_size=25, size=required_size, output_format="dataframe" - ) - self.assertEqual(len(flows), required_size) - def test_list_all_for_setups(self): - required_size = 50 - # TODO apparently list_setups function does not support kwargs - setups = openml.setups.list_setups(size=required_size) +@pytest.fixture() +def min_number_datasets_on_test_server() -> int: + """After a reset at least 127 datasets are on the test server""" + return 127 - # might not be on test server after reset, please rerun test at least once if fails - self.assertEqual(len(setups), required_size) - def test_list_all_for_runs(self): - required_size = 21 - runs = openml.runs.list_runs(batch_size=25, size=required_size) +@pytest.fixture() +def min_number_flows_on_test_server() -> int: + """After a reset at least 127 flows are on the test server""" + return 15 - # might not be on test server after reset, please rerun test at least once if fails - self.assertEqual(len(runs), required_size) - def test_list_all_for_evaluations(self): - required_size = 22 - # TODO apparently list_evaluations function does not support kwargs - evaluations = openml.evaluations.list_evaluations( - function="predictive_accuracy", size=required_size - ) +@pytest.fixture() +def min_number_setups_on_test_server() -> int: + """After a reset at least 50 setups are on the test server""" + return 50 + - # might not be on test server after reset, please rerun test at least once if fails - self.assertEqual(len(evaluations), required_size) - - @unittest.mock.patch("openml.config.get_cache_directory") - 
@unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033") - def test__create_cache_directory(self, config_mock): - with tempfile.TemporaryDirectory(dir=self.workdir) as td: - config_mock.return_value = td - openml.utils._create_cache_directory("abc") - self.assertTrue(os.path.exists(os.path.join(td, "abc"))) - subdir = os.path.join(td, "def") - os.mkdir(subdir) - os.chmod(subdir, 0o444) - config_mock.return_value = subdir - with self.assertRaisesRegex( - openml.exceptions.OpenMLCacheException, - r"Cannot create cache directory", - ): - openml.utils._create_cache_directory("ghi") +@pytest.fixture() +def min_number_runs_on_test_server() -> int: + """After a reset at least 21 runs are on the test server""" + return 21 + + +@pytest.fixture() +def min_number_evaluations_on_test_server() -> int: + """After a reset at least 22 evaluations are on the test server""" + return 22 + + +def _mocked_perform_api_call(call, request_method): + url = openml.config.server + "/" + call + return openml._api_calls._download_text_file(url) + + +@pytest.mark.server() +def test_list_all(): + openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) + openml.utils._list_all( + listing_call=openml.tasks.functions._list_tasks, + list_output_format="dataframe", + ) + + +@pytest.mark.server() +def test_list_all_for_tasks(min_number_tasks_on_test_server): + tasks = openml.tasks.list_tasks( + batch_size=1000, + size=min_number_tasks_on_test_server, + output_format="dataframe", + ) + assert min_number_tasks_on_test_server == len(tasks) + + +@pytest.mark.server() +def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): + # By setting the batch size one lower than the minimum we guarantee at least two + # batches and at the same time do as few batches (roundtrips) as possible. 
+ batch_size = min_number_tasks_on_test_server - 1 + res = openml.utils._list_all( + listing_call=openml.tasks.functions._list_tasks, + list_output_format="dataframe", + batch_size=batch_size, + ) + assert min_number_tasks_on_test_server <= len(res) + + +@pytest.mark.server() +def test_list_all_for_datasets(min_number_datasets_on_test_server): + datasets = openml.datasets.list_datasets( + batch_size=100, + size=min_number_datasets_on_test_server, + output_format="dataframe", + ) + + assert min_number_datasets_on_test_server == len(datasets) + for dataset in datasets.to_dict(orient="index").values(): + _check_dataset(dataset) + + +@pytest.mark.server() +def test_list_all_for_flows(min_number_flows_on_test_server): + flows = openml.flows.list_flows( + batch_size=25, + size=min_number_flows_on_test_server, + output_format="dataframe", + ) + assert min_number_flows_on_test_server == len(flows) + + +@pytest.mark.server() +@pytest.mark.flaky() # Other tests might need to upload runs first +def test_list_all_for_setups(min_number_setups_on_test_server): + # TODO apparently list_setups function does not support kwargs + setups = openml.setups.list_setups(size=min_number_setups_on_test_server) + assert min_number_setups_on_test_server == len(setups) + + +@pytest.mark.server() +@pytest.mark.flaky() # Other tests might need to upload runs first +def test_list_all_for_runs(min_number_runs_on_test_server): + runs = openml.runs.list_runs(batch_size=25, size=min_number_runs_on_test_server) + assert min_number_runs_on_test_server == len(runs) + + +@pytest.mark.server() +@pytest.mark.flaky() # Other tests might need to upload runs first +def test_list_all_for_evaluations(min_number_evaluations_on_test_server): + # TODO apparently list_evaluations function does not support kwargs + evaluations = openml.evaluations.list_evaluations( + function="predictive_accuracy", + size=min_number_evaluations_on_test_server, + ) + assert min_number_evaluations_on_test_server == len(evaluations) + 
+ +@pytest.mark.server() +@unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call) +def test_list_all_few_results_available(_perform_api_call): + datasets = openml.datasets.list_datasets( + size=1000, + data_name="iris", + data_version=1, + output_format="dataframe", + ) + assert len(datasets) == 1, "only one iris dataset version 1 should be present" + assert _perform_api_call.call_count == 1, "expect just one call to get one dataset" + + +@unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033") +@unittest.mock.patch("openml.config.get_cache_directory") +def test__create_cache_directory(config_mock, tmp_path): + config_mock.return_value = tmp_path + openml.utils._create_cache_directory("abc") + assert (tmp_path / "abc").exists() + + subdir = tmp_path / "def" + subdir.mkdir() + subdir.chmod(0o444) + config_mock.return_value = subdir + with pytest.raises( + openml.exceptions.OpenMLCacheException, + match="Cannot create cache directory", + ): + openml.utils._create_cache_directory("ghi") + + +@pytest.mark.server() +def test_correct_test_server_download_state(): + """This test verifies that the test server downloads the data from the correct source. + + If this tests fails, it is highly likely that the test server is not configured correctly. + Usually, this means that the test server is serving data from the task with the same ID from the production server. + That is, it serves parquet files wrongly associated with the test server's task. + """ + task = openml.tasks.get_task(119) + dataset = task.get_dataset() + assert len(dataset.features) == dataset.get_data(dataset_format="dataframe")[0].shape[1] \ No newline at end of file