From 5f7bed6ac18075fee875116bfc9a987fdd8a3752 Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 13:00:28 +0200 Subject: [PATCH 01/15] add github database --- python/setup.py | 2 + python/src/posteriordb/__init__.py | 3 +- python/src/posteriordb/dataset.py | 6 +- python/src/posteriordb/model.py | 6 +- python/src/posteriordb/posterior.py | 6 +- .../posteriordb/posterior_database_github.py | 270 ++++++++++++++++++ 6 files changed, 289 insertions(+), 4 deletions(-) create mode 100644 python/src/posteriordb/posterior_database_github.py diff --git a/python/setup.py b/python/setup.py index 611c8e34..701252d1 100644 --- a/python/setup.py +++ b/python/setup.py @@ -5,6 +5,7 @@ PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) VERSION_FILE = os.path.join(PROJECT_ROOT, "src", "posteriordb", "__init__.py") + def get_version(): lines = open(VERSION_FILE, "rt").readlines() version_regex = r"^__version__ = ['\"]([^'\"]*)['\"]" @@ -14,6 +15,7 @@ def get_version(): return mo.group(1) raise RuntimeError("Unable to find version in %s." % (VERSION_FILE,)) + setup( name="posteriordb", version=get_version(), diff --git a/python/src/posteriordb/__init__.py b/python/src/posteriordb/__init__.py index a6facd3c..11d2526d 100644 --- a/python/src/posteriordb/__init__.py +++ b/python/src/posteriordb/__init__.py @@ -1,3 +1,4 @@ from .posterior_database import PosteriorDatabase +from .posterior_database_github import PosteriorDatabaseGithub -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/python/src/posteriordb/dataset.py b/python/src/posteriordb/dataset.py index dce51656..8aaeb0d0 100644 --- a/python/src/posteriordb/dataset.py +++ b/python/src/posteriordb/dataset.py @@ -1,14 +1,18 @@ import json import os import tempfile +from typing import Union from zipfile import ZipFile from .posterior_database import PosteriorDatabase +from .posterior_database_github import PosteriorDatabaseGithub from .util import drop_keys class Dataset: - def __init__(self, name: str, posterior_db: PosteriorDatabase): + def __init__( + self, name: str, posterior_db: Union[PosteriorDatabase, PosteriorDatabaseGithub] + ): self.name = name self.posterior_db = posterior_db full_information = self.posterior_db.get_data_info(name=self.name) diff --git a/python/src/posteriordb/model.py b/python/src/posteriordb/model.py index 071d9070..680fc256 100644 --- a/python/src/posteriordb/model.py +++ b/python/src/posteriordb/model.py @@ -1,6 +1,8 @@ import os +from typing import Union from .posterior_database import PosteriorDatabase +from .posterior_database_github import PosteriorDatabaseGithub from .pymc3_model_implementation import PyMC3ModelImplementation from .stan_model_implementation import StanModelImplementation from .util import drop_keys @@ -9,7 +11,9 @@ class Model: - def __init__(self, name: str, posterior_db: PosteriorDatabase): + def __init__( + self, name: str, posterior_db: Union[PosteriorDatabase, PosteriorDatabaseGithub] + ): self.name = name self.posterior_db = posterior_db full_model_info = self.posterior_db.get_model_info(name=self.name) diff --git a/python/src/posteriordb/posterior.py b/python/src/posteriordb/posterior.py index 3d43b3cc..43e41bfe 100644 --- a/python/src/posteriordb/posterior.py +++ b/python/src/posteriordb/posterior.py @@ -1,15 +1,19 @@ import json +from typing import Union from zipfile import ZipFile from .dataset import Dataset from .model import Model from .posterior_database import load_json_file from .posterior_database import PosteriorDatabase +from .posterior_database_github import PosteriorDatabaseGithub from .util import drop_keys class Posterior: - def __init__(self, name: str, posterior_db: PosteriorDatabase): + def __init__( + self, name: str, posterior_db: Union[PosteriorDatabase, PosteriorDatabaseGithub] + ): self.name = name assert name in posterior_db.posterior_names() diff --git a/python/src/posteriordb/posterior_database_github.py b/python/src/posteriordb/posterior_database_github.py new file mode 100644 index 00000000..327f0926 --- /dev/null +++ b/python/src/posteriordb/posterior_database_github.py @@ -0,0 +1,270 @@ +import json +import os +import tempfile +from pathlib import Path +import requests + + +def temporary_no_assertions(x): + return x + + +def get_requests(*args, n=3, **kwargs): + for _ in range(n): + try: + response = requests.get(*args, **kwargs) + if response.ok: + break + except Exception as e: + print(e) + continue + else: + print("response:", response.json()) + raise ValueError("response was not successful") + return response + + +def github_pat(): + GITHUB_PAT = os.environ.get("GITHUB_PAT") + headers = {} + if GITHUB_PAT is not None: + headers["Authorization"] = "token {}".format(GITHUB_PAT) + else: + print( + "GitHub access token was not defined (env GITHUB_PAT not defined), using limited API rate limit." + ) + return headers + + +def get_content(url, path=None, headers=None): + """Download contents assuming GitHub API v3""" + + if headers is None: + headers = github_pat() + if path is not None: + path = Path(path) + verify = os.environ.get("REQUESTS_VERIFY", True) + if str(verify).lower() in ("0", "false"): + verify = False + verify = bool(verify) + + response = get_requests(url, verify=verify, headers=headers) + contents = {} + + for content in response.json(): + try: + if content["type"] == "dir": + contents.update( + get_content(content["_links"]["self"], path=path, headers=headers) + ) + else: + if path: + key = path / content["path"] + else: + key = content["path"] + contents[key] = content + except Exception as e: + print("Invalid content:", content, "Exception was raised (and ignored):", e) + continue + return contents + + +def download_file(url, path): + """Download file with a requests. + + To manually disable requests verify + set environmental variable REQUESTS_VERIFY to false. + """ + verify = os.environ.get("REQUESTS_VERIFY", True) + if str(verify).lower() in ("0", "false"): + verify = False + verify = bool(verify) + + headers = github_pat() + try: + response = get_requests(url, verify=verify, headers=headers) + + path.parent.mkdir(parents=True, exist_ok=True) + + with path.open(mode="wb") as f: + f.write(response.content) + except Exception as e: + print("Exception was raised (and ignored):", e) + return False + return True + + +def load_json_file(path, metadata): + if not path.exists(): + if metadata is not None: + download_file(metadata["download_url"], path) + else: + raise TypeError("File was not found in GitHub") + with path.open(encoding="utf-8") as f: + data = json.load(f) + return data + + +def load_info(path, assertion_function, metadata): + info = load_json_file(path, metadata) + assertion_function(info) + return info + + +def filenames_in_dir_no_extension(directory, gh_directory, extension): + path = Path(directory) + + filenames = [p.with_suffix("").stem for p in path.glob("*" + extension)] + gh_filenames = [p.with_suffix("").stem for p in gh_directory] + return sorted(set(filenames + gh_filenames)) + + +class PosteriorDatabaseGithub: + def __init__( + self, path: str = None, repo="MansMeg/posteriordb", ref="master", refresh=True + ): + if path is None: + path = os.environ.get("POSTERIOR_DB_DIR") + if path is None: + path = Path.home() / ".posteriordb" / "posterior_database" + path.mkdir(parents=True, exist_ok=True) + self.path = Path(path) + + self._url = "https://api.github.com/repos/{repo}/contents/posterior_database?ref={ref}".format( + repo=repo, ref=ref + ) + if refresh_github: + self.refresh_github() + else: + self.links = {} + + # TODO assert that path is a valid posterior database + + def refresh_github(self): + self._links = get_content(self._url, path=self.path.parent) + + def full_path(self, path: str): + return self.path / path + + def posterior(self, name): + # inline import needed to avoid import loop + from .posterior import Posterior + + return Posterior(name, self) + + def model(self, name): + # inline import needed to avoid import loop + from .model import Model + + return Model(name, self) + + def data(self, name): + # inline import needed to avoid import loop + from .dataset import Dataset + + return Dataset(name, self) + + def posterior_info_path(self, name: str): + path = self.path / "posteriors" / (name + ".json") + return path + + def get_posterior_info(self, name: str): + path = self.posterior_info_path(name) + return load_info(path, temporary_no_assertions, self._links.get(path)) + + def get_model_info(self, name: str): + # load from the correct path + file_name = name + ".info.json" + path = self.path / "models" / "info" / file_name + + return load_info(path, temporary_no_assertions, self._links.get(path)) + + def get_data_info(self, name: str): + file_name = name + ".info.json" + path = self.path / "data" / "info" / file_name + + return load_info(path, temporary_no_assertions, self._links.get(path)) + + def get_reference_draws_path(self, name: str): + reference_root = self.path / "reference_posteriors" / "draws" / "draws" + reference_name = self.get_posterior_info(name).get("reference_posterior_name") + assert reference_name is not None + path = reference_root / (reference_name + ".json") + return path + + def get_reference_draws_info(self, name: str): + reference_root = self.path / "reference_posteriors" / "draws" / "info" + reference_name = self.get_posterior_info(name).get("reference_posterior_name") + assert reference_name is not None + path = reference_root / (reference_name + ".info.json") + return load_info(path, temporary_no_assertions, self._links.get(path, None)) + + def get_model_code_path(self, name, framework): + model_info = self.get_model_info(name) + path_within_posterior_db = model_info["model_implementations"][framework][ + "model_code" + ] + path = self.path / path_within_posterior_db + + # download model code + if not path.exists(): + download_file(self._links[path]["download_url"], path) + + return path + + def get_dataset_path(self, name): + data_info = self.get_data_info(name) + path = self.path / (data_info["data_file"] + ".zip") + + # download model code + if not path.exists(): + download_file(self._links[path]["download_url"], path) + + return path + + def posterior_names(self): + directory = self.path / "posteriors" + # walk directory, find json files + # strip file extension + gh_directory = [ + key for key in self._links if str(directory.resolve()) in str(key.resolve()) + ] + return filenames_in_dir_no_extension(directory, gh_directory, ".json") + + def model_names(self): + directory = self.path / "models" / "info" + gh_directory = [ + key for key in self._links if str(directory.resolve()) in str(key.resolve()) + ] + return filenames_in_dir_no_extension(directory, gh_directory, ".info.json") + + def dataset_names(self): + directory = self.path / "data" / "info" + gh_directory = [ + key for key in self._links if str(directory.resolve()) in str(key.resolve()) + ] + return filenames_in_dir_no_extension(directory, gh_directory, ".info.json") + + def posteriors(self): + names = self.posterior_names() + # inline import needed to avoid import loop + from .posterior import Posterior + + for name in names: + yield Posterior(name, self) + + def models(self): + names = self.model_names() + # inline import needed to avoid import loop + from .model import Model + + for name in names: + yield Model(name, self) + + def datasets(self): + names = self.dataset_names() + # inline import needed to avoid import loop + from .dataset import Dataset + + for name in names: + yield Dataset(name, self) From 553303455c8a031b5cfe33ab0fad9225a8488d67 Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 15:02:01 +0200 Subject: [PATCH 02/15] update readme and enable download all --- python/README.md | 30 +++++++++++++++---- python/src/posteriordb/posterior.py | 1 - .../posteriordb/posterior_database_github.py | 29 +++++++++++++++--- 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/python/README.md b/python/README.md index 722b3c93..2882a821 100644 --- a/python/README.md +++ b/python/README.md @@ -4,12 +4,19 @@ Currently only python 3.6+ is supported. Python 3.5+ support can be added if nee ## Installation -Currently only local install is supported. From the main directory of this project run +Installation from PyPI is recommended. + ```bash -pip install python/ +pip install posteriordb ``` -Installing from git url will be supported soon. Publishing the package to PyPI will also happen at some point. +Or with a local git clone + +```bash +git clone https://github.com/MansMeg/posteriordb +cd posteriordb +pip install python/ +``` ## Using the posterior database from python @@ -23,9 +30,22 @@ First we create the posterior database to use, here the cloned posterior databas >>> pdb_path = os.path.join(os.getcwd(), "posterior_database") >>> my_pdb = PosteriorDatabase(pdb_path) ``` - The above code requires that your working directory is in the main folder of your copy -of this project. Alternatively, you can specify the path to the folder directly. To list the posteriors available, use `posterior_names`. +of this project. Alternatively, you can specify the path to the folder directly. + +To use online database use `PosteriorDatabaseGithub` class. Remember to create and set `GITHUB_PAT` environmental variable. +It is recommended that users create a read-only Personal Access Token for `posteriordb` use. +If not explicitly defined, `PosteriorDatabaseGithub` will create a new or use old database located at `POSTERIOR_DB_DIR` if +defined and `$HOME/.posteriordb/posterior_database`. Each used model and data is downloaded from online dynamically when needed. + +```python +>>> from posteriordb import PosteriorDatabaseGithub +>>> import os +>>> os.environ["GITHUB_PAT"] = "token-string-here" +>>> my_pdb = PosteriorDatabaseGithub() +``` + +To list the posteriors available, use `posterior_names`. ```python >>> pos = my_pdb.posterior_names() diff --git a/python/src/posteriordb/posterior.py b/python/src/posteriordb/posterior.py index 43e41bfe..33d44f38 100644 --- a/python/src/posteriordb/posterior.py +++ b/python/src/posteriordb/posterior.py @@ -4,7 +4,6 @@ from .dataset import Dataset from .model import Model -from .posterior_database import load_json_file from .posterior_database import PosteriorDatabase from .posterior_database_github import PosteriorDatabaseGithub from .util import drop_keys diff --git a/python/src/posteriordb/posterior_database_github.py b/python/src/posteriordb/posterior_database_github.py index 327f0926..319d4b72 100644 --- a/python/src/posteriordb/posterior_database_github.py +++ b/python/src/posteriordb/posterior_database_github.py @@ -130,19 +130,40 @@ def __init__( path.mkdir(parents=True, exist_ok=True) self.path = Path(path) - self._url = "https://api.github.com/repos/{repo}/contents/posterior_database?ref={ref}".format( - repo=repo, ref=ref - ) + self._repo = repo + self._ref = ref + + self.refresh_url() + if refresh_github: self.refresh_github() else: - self.links = {} + self._links = {} # TODO assert that path is a valid posterior database + def refresh_url(self, repo=None, ref=None): + if repo is not None: + self._repo = repo + if ref is not None: + self._ref = ref + + self._url = "https://api.github.com/repos/{repo}/contents/posterior_database?ref={ref}".format( + repo=self._repo, ref=self._ref + ) + def refresh_github(self): self._links = get_content(self._url, path=self.path.parent) + def download_all(self, refresh=True): + """Download all files for database.""" + if refresh: + self.refresh_github() + n = len(self._links) + for i, (path, metadata) in enumerate(self._links.items(), 1): + print("\rFile ({}/{})".format(i, n), end="") + download_file(metadata["download_url"], path) + def full_path(self, path: str): return self.path / path From 7efa7059fb5f353c5a693c5ea7651974bbe6c075 Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 15:38:27 +0200 Subject: [PATCH 03/15] add tests --- .github/workflows/workflow-python.yml | 2 + python/setup.py | 8 ++++ python/tests/test_pdb.py | 63 ++++++++++++++++++++++++++- 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/.github/workflows/workflow-python.yml b/.github/workflows/workflow-python.yml index cc89e592..30b03809 100644 --- a/.github/workflows/workflow-python.yml +++ b/.github/workflows/workflow-python.yml @@ -16,6 +16,8 @@ jobs: os: [ubuntu-latest, macos-latest, windows-latest] python-version: [3.6, 3.7, 3.8, 3.9] fail-fast: false + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - name: Checkout github uses: actions/checkout@v2 diff --git a/python/setup.py b/python/setup.py index 701252d1..efeff887 100644 --- a/python/setup.py +++ b/python/setup.py @@ -4,6 +4,12 @@ PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) VERSION_FILE = os.path.join(PROJECT_ROOT, "src", "posteriordb", "__init__.py") +README_FILE = os.path.join(PROJECT_ROOT, "README.md") + + +def get_long_description(): + with open(README_FILE, "rt") as buff: + return buff.read() def get_version(): @@ -24,6 +30,8 @@ def get_version(): author="Eero Linna", author_email="eero.linna@aalto.fi", license="GPL", + long_description=get_long_description(), + long_description_content_type="text/markdown", packages=["posteriordb"], package_dir={"": "src"}, zip_safe=False, diff --git a/python/tests/test_pdb.py b/python/tests/test_pdb.py index 4ab7aae5..f097b43a 100644 --- a/python/tests/test_pdb.py +++ b/python/tests/test_pdb.py @@ -1,6 +1,6 @@ import os -from posteriordb import PosteriorDatabase +from posteriordb import PosteriorDatabase, PosteriorDatabaseGithub def test_posterior_database(): @@ -61,3 +61,64 @@ def test_posterior_database(): datasets = list(pdb.datasets()) assert len(datasets) > 0 + + +def test_posterior_database_github(): + # Skip test if GITHUB_PAT not defined + if "GITHUB_PAT" not in os.environ: + return + + pdb = PosteriorDatabaseGithub() + + model_names = pdb.model_names() + assert len(model_names) > 0 + + dataset_names = pdb.dataset_names() + assert len(dataset_names) > 0 + + posterior_names = pdb.posterior_names() + + assert len(posterior_names) > 0 + + for name in posterior_names: + posterior = pdb.posterior(name) + + assert posterior.name is not None + + # test dataset methods + data = posterior.data + assert data.name is not None + + assert data.values() is not None + assert data.file_path() is not None + + assert data.information is not None + + # test that pdb.data works + data2 = pdb.data(data.name) + assert data2 is not None + + # test model methods + model = posterior.model + + assert model.name is not None + + assert model.code("stan") is not None + assert model.code_file_path("stan") is not None + assert model.stan_code() is not None + assert model.stan_code_file_path() is not None + + assert model.information is not None + + # test that pdb.model works + model2 = pdb.model(model.name) + assert model2 is not None + + posteriors = list(pdb.posteriors()) + assert len(posteriors) > 0 + + models = list(pdb.models()) + assert len(models) > 0 + + datasets = list(pdb.datasets()) + assert len(datasets) > 0 From a57fefbb3dd07594da16595058a0e34589120f89 Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 15:40:22 +0200 Subject: [PATCH 04/15] update wording --- python/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/README.md b/python/README.md index 2882a821..f9a6e327 100644 --- a/python/README.md +++ b/python/README.md @@ -10,7 +10,7 @@ Installation from PyPI is recommended. pip install posteriordb ``` -Or with a local git clone +Installing from the local clone. ```bash git clone https://github.com/MansMeg/posteriordb From 0dbe055a494d5f1a9510405e8bd783eee1324506 Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 15:42:42 +0200 Subject: [PATCH 05/15] add instructions for PAT --- python/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/README.md b/python/README.md index f9a6e327..b27a9a9d 100644 --- a/python/README.md +++ b/python/README.md @@ -34,7 +34,10 @@ The above code requires that your working directory is in the main folder of you of this project. Alternatively, you can specify the path to the folder directly. To use online database use `PosteriorDatabaseGithub` class. Remember to create and set `GITHUB_PAT` environmental variable. -It is recommended that users create a read-only Personal Access Token for `posteriordb` use. +It is recommended that users create a read-only (no extra permissions) Personal Access Token (PAT) for `posteriordb` use. + +https://docs.github.com/en/free-pro-team@latest/github/authenticating-to-github/creating-a-personal-access-token + If not explicitly defined, `PosteriorDatabaseGithub` will create a new or use old database located at `POSTERIOR_DB_DIR` if defined and `$HOME/.posteriordb/posterior_database`. Each used model and data is downloaded from online dynamically when needed. From ebd87c5bab50a12e7e49490827283e7ea698b90f Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 15:45:15 +0200 Subject: [PATCH 06/15] try all branches --- .github/workflows/workflow-python.yml | 4 ++-- .github/workflows/workflow-r.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/workflow-python.yml b/.github/workflows/workflow-python.yml index 30b03809..90ad26a6 100644 --- a/.github/workflows/workflow-python.yml +++ b/.github/workflows/workflow-python.yml @@ -3,9 +3,9 @@ name: Python on: push: branches: - - '*' + - '**' tags: - - '*' + - '**' jobs: models: diff --git a/.github/workflows/workflow-r.yml b/.github/workflows/workflow-r.yml index 5b7972d2..358c666f 100644 --- a/.github/workflows/workflow-r.yml +++ b/.github/workflows/workflow-r.yml @@ -3,9 +3,9 @@ name: R on: push: branches: - - '*' + - '**' tags: - - '*' + - '**' jobs: models: From 59b93cd37f9f369c4a0ffd61a03ac4d343dd3a97 Mon Sep 17 00:00:00 2001 From: Ari Hartikainen Date: Tue, 24 Nov 2020 16:37:33 +0200 Subject: [PATCH 07/15] add requests to requirements --- python/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/setup.py b/python/setup.py index efeff887..9143c50d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -34,5 +34,6 @@ def get_version(): long_description_content_type="text/markdown", packages=["posteriordb"], package_dir={"": "src"}, + install_requires=["requests"], zip_safe=False, ) From 50c2a708831c32db9094aaf7cc753f7ccf8ff810 Mon Sep 17 00:00:00 2001 From: Ari Hartikainen Date: Tue, 24 Nov 2020 16:52:48 +0200 Subject: [PATCH 08/15] Update link and a comment where one should put the PAT --- python/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/README.md b/python/README.md index b27a9a9d..204188c5 100644 --- a/python/README.md +++ b/python/README.md @@ -34,9 +34,9 @@ The above code requires that your working directory is in the main folder of you of this project. Alternatively, you can specify the path to the folder directly. To use online database use `PosteriorDatabaseGithub` class. Remember to create and set `GITHUB_PAT` environmental variable. -It is recommended that users create a read-only (no extra permissions) Personal Access Token (PAT) for `posteriordb` use. +It is recommended that users create a read-only (no extra permissions) [GitHub Personal Access Token (PAT)](https://docs.github.com/en/free-pro-team@latest/github/authenticating-to-github/creating-a-personal-access-token) for `posteriordb` use. It is recommended that the +GITHUB_PAT variable is added to user environmental variables and it is not set in Python script as shown in the example. -https://docs.github.com/en/free-pro-team@latest/github/authenticating-to-github/creating-a-personal-access-token If not explicitly defined, `PosteriorDatabaseGithub` will create a new or use old database located at `POSTERIOR_DB_DIR` if defined and `$HOME/.posteriordb/posterior_database`. Each used model and data is downloaded from online dynamically when needed. @@ -44,6 +44,8 @@ defined and `$HOME/.posteriordb/posterior_database`. Each used model and data is ```python >>> from posteriordb import PosteriorDatabaseGithub >>> import os +>>> # It is recommended that GITHUB_PAT is added to the user environmental variables +>>> # outside python and not in a python script as shown in this example code >>> os.environ["GITHUB_PAT"] = "token-string-here" >>> my_pdb = PosteriorDatabaseGithub() ``` From 9fea797a174a50e5cf15af86c9be0e6e1e250093 Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 18:49:59 +0200 Subject: [PATCH 09/15] check hash if overwrite --- .../posteriordb/posterior_database_github.py | 91 +++++++++++++++---- python/tests/test_pdb.py | 11 ++- 2 files changed, 84 insertions(+), 18 deletions(-) diff --git a/python/src/posteriordb/posterior_database_github.py b/python/src/posteriordb/posterior_database_github.py index 319d4b72..30e8361d 100644 --- a/python/src/posteriordb/posterior_database_github.py +++ b/python/src/posteriordb/posterior_database_github.py @@ -1,3 +1,4 @@ +import hashlib import json import os import tempfile @@ -69,12 +70,17 @@ def get_content(url, path=None, headers=None): return contents -def download_file(url, path): +def download_file(url, path, overwrite=False, sha=None): """Download file with a requests. To manually disable requests verify set environmental variable REQUESTS_VERIFY to false. """ + # Check if file exists and + if overwrite and (sha is not None) and path.exists(): + if sha256 == get_sha256_hash(path): + return True + verify = os.environ.get("REQUESTS_VERIFY", True) if str(verify).lower() in ("0", "false"): verify = False @@ -94,19 +100,32 @@ def download_file(url, path): return True -def load_json_file(path, metadata): +def get_sha1_hash(path): + sha1_hash = hashlib.sha1() + size = path.stat().st_size + with path.open("rb") as f: + sha1_hash.update(bytes("blob {}".format(size), encoding="utf-8") + b"\0") + for byte_block in iter(lambda: f.read(4096), b""): + sha1_hash.update(byte_block) + return sha1_hash.hexdigest() + + +def load_json_file(path, metadata, overwrite=False): if not path.exists(): if metadata is not None: download_file(metadata["download_url"], path) else: raise TypeError("File was not found in GitHub") + elif overwrite: + if metadata is not None: + download_file(metadata["download_url"], path, sha=metadata["sha"]) with path.open(encoding="utf-8") as f: data = json.load(f) return data -def load_info(path, assertion_function, metadata): - info = load_json_file(path, metadata) +def load_info(path, assertion_function, metadata, overwrite=False): + info = load_json_file(path, metadata, overwrite=overwrite) assertion_function(info) return info @@ -121,7 +140,12 @@ def filenames_in_dir_no_extension(directory, gh_directory, extension): class PosteriorDatabaseGithub: def __init__( - self, path: str = None, repo="MansMeg/posteriordb", ref="master", refresh=True + self, + path: str = None, + repo="MansMeg/posteriordb", + ref="master", + refresh=True, + overwrite=False, ): if path is None: path = os.environ.get("POSTERIOR_DB_DIR") @@ -130,12 +154,13 @@ def __init__( path.mkdir(parents=True, exist_ok=True) self.path = Path(path) + self.overwrite = overwrite self._repo = repo self._ref = ref self.refresh_url() - if refresh_github: + if refresh: self.refresh_github() else: self._links = {} @@ -155,14 +180,18 @@ def refresh_url(self, repo=None, ref=None): def refresh_github(self): self._links = get_content(self._url, path=self.path.parent) - def download_all(self, refresh=True): + def download_all(self, refresh=True, overwrite=None): """Download all files for database.""" if refresh: self.refresh_github() + if overwrite is None: + overwrite = self.overwrite n = len(self._links) for i, (path, metadata) in enumerate(self._links.items(), 1): print("\rFile ({}/{})".format(i, n), end="") - download_file(metadata["download_url"], path) + download_file( + metadata["download_url"], path, overwrite=overwrite, sha=metadata["sha"] + ) def full_path(self, path: str): return self.path / path @@ -191,20 +220,35 @@ def posterior_info_path(self, name: str): def get_posterior_info(self, name: str): path = self.posterior_info_path(name) - return load_info(path, temporary_no_assertions, self._links.get(path)) + return load_info( + path, + temporary_no_assertions, + self._links.get(path), + overwrite=self.overwrite, + ) def get_model_info(self, name: str): # load from the correct path file_name = name + ".info.json" path = self.path / "models" / "info" / file_name - return load_info(path, temporary_no_assertions, self._links.get(path)) + return load_info( + path, + temporary_no_assertions, + self._links.get(path), + overwrite=self.overwrite, + ) def get_data_info(self, name: str): file_name = name + ".info.json" path = self.path / "data" / "info" / file_name - return load_info(path, temporary_no_assertions, self._links.get(path)) + return load_info( + path, + temporary_no_assertions, + self._links.get(path), + overwrite=self.overwrite, + ) def get_reference_draws_path(self, name: str): reference_root = self.path / "reference_posteriors" / "draws" / "draws" @@ -218,7 +262,12 @@ def get_reference_draws_info(self, name: str): reference_name = self.get_posterior_info(name).get("reference_posterior_name") assert reference_name is not None path = reference_root / (reference_name + ".info.json") - return load_info(path, temporary_no_assertions, self._links.get(path, None)) + return load_info( + path, + temporary_no_assertions, + self._links.get(path, None), + overwrite=self.overwrite, + ) def get_model_code_path(self, name, framework): model_info = self.get_model_info(name) @@ -228,8 +277,13 @@ def get_model_code_path(self, name, framework): path = self.path / path_within_posterior_db # download model code - if not path.exists(): - download_file(self._links[path]["download_url"], path) + if not path.exists() or self.overwrite: + download_file( + self._links[path]["download_url"], + path, + overwrite=self.overwrite, + sha=self._links[path]["sha"], + ) return path @@ -238,8 +292,13 @@ def get_dataset_path(self, name): path = self.path / (data_info["data_file"] + ".zip") # download model code - if not path.exists(): - download_file(self._links[path]["download_url"], path) + if not path.exists() or self.overwrite: + download_file( + self._links[path]["download_url"], + path, + overwrite=self.overwrite, + sha=self._links[path]["sha"], + ) return path diff --git a/python/tests/test_pdb.py b/python/tests/test_pdb.py index f097b43a..83bab8bd 100644 --- a/python/tests/test_pdb.py +++ b/python/tests/test_pdb.py @@ -1,4 +1,5 @@ import os +import re from posteriordb import PosteriorDatabase, PosteriorDatabaseGithub @@ -65,10 +66,16 @@ def test_posterior_database(): def test_posterior_database_github(): # Skip test if GITHUB_PAT not defined - if "GITHUB_PAT" not in os.environ: + if os.environ.get("GITHUB_PAT") is None: return - pdb = PosteriorDatabaseGithub() + kwargs = {} + if os.environ.get("GITHUB_ACTIONS"): + kwargs["repo"] = os.environ.get("GITHUB_REPOSITORY", "MansMeg/posteriordb") + kwargs["ref"] = re.sub( + "^refs/heads/", "", os.environ.get("GITHUB_REF", "master") + ) + pdb = PosteriorDatabaseGithub(**kwargs) model_names = pdb.model_names() assert len(model_names) > 0 From de60281f669832bb3a57ffb51b9e61f8a59f784c Mon Sep 17 00:00:00 2001 From: Ari Hartikainen Date: Tue, 24 Nov 2020 20:19:40 +0200 Subject: [PATCH 10/15] Fix ref --- rpackage/R/pdb_github.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rpackage/R/pdb_github.R b/rpackage/R/pdb_github.R index 97c8ff5e..d95a82ca 100644 --- a/rpackage/R/pdb_github.R +++ b/rpackage/R/pdb_github.R @@ -155,8 +155,8 @@ github_ref <- function(pdb = NULL) { ref <- Sys.getenv("GITHUB_REF") if (nzchar(ref)) { # This is to handle that GITHUB_REF on Github Actions return 'refs/heads/[ref]' - ref <- strsplit(ref, "/")[[1]] - return(ref[length(ref)]) + ref <- sub("^refs/heads/", "", ref) + return(ref) } else { return("master") } From 82c0404ae87e5f2bbe12d1c114e84382237fb8d7 Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 22:21:37 +0200 Subject: [PATCH 11/15] fix overwrite and add progress bar and test sha1 --- .../posteriordb/posterior_database_github.py | 20 +++++++++++++++---- python/tests/test_pdb.py | 13 ++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/python/src/posteriordb/posterior_database_github.py b/python/src/posteriordb/posterior_database_github.py index 30e8361d..b6931c57 100644 --- a/python/src/posteriordb/posterior_database_github.py +++ b/python/src/posteriordb/posterior_database_github.py @@ -76,9 +76,10 @@ def download_file(url, path, overwrite=False, sha=None): To manually disable requests verify set environmental variable REQUESTS_VERIFY to false. """ - # Check if file exists and + if (not overwrite) and path.exists(): + return True if overwrite and (sha is not None) and path.exists(): - if sha256 == get_sha256_hash(path): + if sha == get_sha1_hash(path): return True verify = os.environ.get("REQUESTS_VERIFY", True) @@ -160,6 +161,9 @@ def __init__( self.refresh_url() + self._progressbar_width = 30 + self._progressbar_marker = b"\xe2\x96\xa0".decode("utf-8") + if refresh: self.refresh_github() else: @@ -180,7 +184,7 @@ def refresh_url(self, repo=None, ref=None): def refresh_github(self): self._links = get_content(self._url, path=self.path.parent) - def download_all(self, refresh=True, overwrite=None): + def download_all(self, refresh=False, overwrite=None, show_progress=True): """Download all files for database.""" if refresh: self.refresh_github() @@ -188,7 +192,15 @@ def download_all(self, refresh=True, overwrite=None): overwrite = self.overwrite n = len(self._links) for i, (path, metadata) in enumerate(self._links.items(), 1): - print("\rFile ({}/{})".format(i, n), end="") + print( + "\rDownloading {:<30} ({}/{})".format( + self._progressbar_marker * int(i / n * self._progressbar_width), + i, + n, + ), + end="", + flush=True, + ) download_file( metadata["download_url"], path, overwrite=overwrite, sha=metadata["sha"] ) diff --git a/python/tests/test_pdb.py b/python/tests/test_pdb.py index 83bab8bd..66ca63fa 100644 --- a/python/tests/test_pdb.py +++ b/python/tests/test_pdb.py @@ -2,6 +2,7 @@ import re from posteriordb import PosteriorDatabase, PosteriorDatabaseGithub +from posteriordb.posterior_database_github import get_sha1_hash def test_posterior_database(): @@ -121,6 +122,18 @@ def test_posterior_database_github(): model2 = pdb.model(model.name) assert model2 is not None + # check sha1 + ((model_file_path, sha1_github),) = [ + (key, item["sha"]) + for key, item in pdb._links.items() + if ( + (key.with_suffix("").stem == model.name) + and ("/models/" in key.as_posix()) + ) + ] + sha1_file = get_sha1_hash(model_file_path) + assert sha1_file == sha1_github + posteriors = list(pdb.posteriors()) assert len(posteriors) > 0 From bdf6b87896f6ca69000b9ed387b859cf96e6893c Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 22:48:43 +0200 Subject: [PATCH 12/15] update test --- python/tests/test_pdb.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/tests/test_pdb.py b/python/tests/test_pdb.py index 66ca63fa..696b9d25 100644 --- a/python/tests/test_pdb.py +++ b/python/tests/test_pdb.py @@ -123,7 +123,7 @@ def test_posterior_database_github(): assert model2 is not None # check sha1 - ((model_file_path, sha1_github),) = [ + model_files = [ (key, item["sha"]) for key, item in pdb._links.items() if ( @@ -131,8 +131,9 @@ def test_posterior_database_github(): and ("/models/" in key.as_posix()) ) ] - sha1_file = get_sha1_hash(model_file_path) - assert sha1_file == sha1_github + for model_file_path, sha1_github in model_files: + sha1_file = get_sha1_hash(model_file_path) + assert sha1_file == sha1_github posteriors = list(pdb.posteriors()) assert len(posteriors) > 0 From 187d5a4ae500db75427505a4b47a86e03831b581 Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 22:50:46 +0200 Subject: [PATCH 13/15] fail on ci if github_pat is not given --- python/tests/test_pdb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tests/test_pdb.py b/python/tests/test_pdb.py index 696b9d25..8a572476 100644 --- a/python/tests/test_pdb.py +++ b/python/tests/test_pdb.py @@ -68,6 +68,8 @@ def test_posterior_database(): def test_posterior_database_github(): # Skip test if GITHUB_PAT not defined if os.environ.get("GITHUB_PAT") is None: + if os.environ.get("CI"): + raise TypeError("GITHUB_PAT environmental variable is missing") return kwargs = {} From 1080bcad8b45311e5a5c1714d2f0f04b3b643fad Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Tue, 24 Nov 2020 23:00:26 +0200 Subject: [PATCH 14/15] reword README --- python/README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/README.md b/python/README.md index 204188c5..cfe29698 100644 --- a/python/README.md +++ b/python/README.md @@ -33,13 +33,14 @@ First we create the posterior database to use, here the cloned posterior databas The above code requires that your working directory is in the main folder of your copy of this project. Alternatively, you can specify the path to the folder directly. -To use online database use `PosteriorDatabaseGithub` class. Remember to create and set `GITHUB_PAT` environmental variable. -It is recommended that users create a read-only (no extra permissions) [GitHub Personal Access Token (PAT)](https://docs.github.com/en/free-pro-team@latest/github/authenticating-to-github/creating-a-personal-access-token) for `posteriordb` use. It is recommended that the -GITHUB_PAT variable is added to user environmental variables and it is not set in Python script as shown in the example. +Online database can be used with the `PosteriorDatabaseGithub` class. Remember to create and set `GITHUB_PAT` environmental variable. +It's recommended that users create a read-only (no extra permissions) [GitHub Personal Access Token (PAT)](https://docs.github.com/en/free-pro-team@latest/github/authenticating-to-github/creating-a-personal-access-token) for `posteriordb` use. It's also recommended that the +`GITHUB_PAT` environmental variable is added to user environmental variables and it is not shown in Python script as in the example below. -If not explicitly defined, `PosteriorDatabaseGithub` will create a new or use old database located at `POSTERIOR_DB_DIR` if -defined and `$HOME/.posteriordb/posterior_database`. Each used model and data is downloaded from online dynamically when needed. +If not explicitly defined, `PosteriorDatabaseGithub` will create a new (or use old database) located at `POSTERIOR_DB_DIR` if it's +defined and finally as a fallback `$HOME/.posteriordb/posterior_database` is used. +Each model and data is only downloaded and cached when needed. ```python >>> from posteriordb import PosteriorDatabaseGithub From 1adf53b705e3176fbb4d50f0352606f5f201a81f Mon Sep 17 00:00:00 2001 From: Ari Hartikainen <> Date: Wed, 25 Nov 2020 11:43:56 +0200 Subject: [PATCH 15/15] change to POSTERIOR_DB_PATH --- python/README.md | 4 ++-- python/src/posteriordb/posterior_database.py | 6 ++++-- python/src/posteriordb/posterior_database_github.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/python/README.md b/python/README.md index cfe29698..818d2c6a 100644 --- a/python/README.md +++ b/python/README.md @@ -38,8 +38,8 @@ It's recommended that users create a read-only (no extra permissions) [GitHub Pe `GITHUB_PAT` environmental variable is added to user environmental variables and it is not shown in Python script as in the example below. -If not explicitly defined, `PosteriorDatabaseGithub` will create a new (or use old database) located at `POSTERIOR_DB_DIR` if it's -defined and finally as a fallback `$HOME/.posteriordb/posterior_database` is used. +If not explicitly defined, `PosteriorDatabase` and `PosteriorDatabaseGithub` will create a new (or use old database) located at `POSTERIOR_DB_PATH` if it's +defined. `PosteriorDatabaseGithub` will finally use `$HOME/.posteriordb/posterior_database` as a fallback location if no environmental variables have been set. Each model and data is only downloaded and cached when needed. ```python diff --git a/python/src/posteriordb/posterior_database.py b/python/src/posteriordb/posterior_database.py index aa0db56c..7340ca31 100644 --- a/python/src/posteriordb/posterior_database.py +++ b/python/src/posteriordb/posterior_database.py @@ -32,8 +32,10 @@ def filenames_in_dir_no_extension(directory, extension): class PosteriorDatabase: - def __init__(self, path: str): - self.path = path + def __init__(self, path: str = None): + if path is None: + path = os.environ.get("POSTERIOR_DB_PATH") + self.path = str(path) # TODO assert that path is a valid posterior database def full_path(self, path: str): diff --git a/python/src/posteriordb/posterior_database_github.py b/python/src/posteriordb/posterior_database_github.py index b6931c57..8478be67 100644 --- a/python/src/posteriordb/posterior_database_github.py +++ b/python/src/posteriordb/posterior_database_github.py @@ -149,7 +149,7 @@ def __init__( overwrite=False, ): if path is None: - path = os.environ.get("POSTERIOR_DB_DIR") + path = os.environ.get("POSTERIOR_DB_PATH") if path is None: path = Path.home() / ".posteriordb" / "posterior_database" path.mkdir(parents=True, exist_ok=True)