Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new source hashing methods: content_sha256, content_sha384, content_sha512 #5277

Merged
merged 38 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
eac67a9
add content_sha256 hash checks
jaimergp Apr 12, 2024
af571af
fix algo id
jaimergp Apr 12, 2024
08d7691
pre-commit
jaimergp Apr 12, 2024
19235f1
extend tests and include path, type and executable bit in the hash
jaimergp Apr 13, 2024
704ba21
make it cross-platform
jaimergp Apr 15, 2024
0426db2
add news
jaimergp Apr 15, 2024
47fe18d
use dash separator
jaimergp Apr 15, 2024
ab810a4
update hashes
jaimergp Apr 15, 2024
91d3a4d
Merge branch 'main' into content-hash
jaimergp Jun 18, 2024
4e0f6dd
Update source.py
jaimergp Jun 18, 2024
002b309
Merge branch 'main' of github.com:conda/conda-build into content-hash
jaimergp Nov 19, 2024
1439e4e
change algorithm a bit and update tests
jaimergp Nov 19, 2024
4f4178b
move to Path.rglob() and allow skips
jaimergp Nov 20, 2024
190e120
register new keys
jaimergp Nov 20, 2024
4b3d56d
update recipe
jaimergp Nov 20, 2024
f513069
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 20, 2024
c409505
add docs
jaimergp Nov 20, 2024
5327e4a
pre-commit
jaimergp Nov 20, 2024
27b9eaf
normalize line endings
jaimergp Nov 20, 2024
73a23ae
prevent partial hash changes in hybrid text/binary files
jaimergp Nov 20, 2024
16260bf
sort by str, not Path
jaimergp Nov 23, 2024
b7f59ef
use separate git cache for this one
jaimergp Nov 24, 2024
c9b7e7b
override src_cache_root instead
jaimergp Nov 24, 2024
b7a4547
pre-commit
jaimergp Nov 24, 2024
50b219e
revert
jaimergp Nov 25, 2024
f60bcdd
force checkout
jaimergp Nov 25, 2024
ffcda69
try with constructor
jaimergp Nov 25, 2024
d434dce
stop force
jaimergp Nov 25, 2024
a950958
add `?` separator for unknown file types
jaimergp Nov 26, 2024
d97d081
pre-commit
jaimergp Nov 26, 2024
36f23c3
Merge branch 'main' of github.com:conda/conda-build into content-hash
jaimergp Nov 26, 2024
2afa293
drop content_{md5,sha1} and add content_{sha384,sha512}
jaimergp Nov 26, 2024
98d8813
add here too
jaimergp Nov 26, 2024
c192799
use a 10MB SpooledTemporaryFile
jaimergp Nov 27, 2024
5edfb20
pre-commit
jaimergp Nov 27, 2024
bcc7ad5
do error on unreadable files and unknown types
jaimergp Nov 27, 2024
6fed084
remove `log` at the bottom
jaimergp Jan 6, 2025
33eb089
Merge branch 'main' into content-hash
jaimergp Jan 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions conda_build/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,10 @@ def parse(data, config, path=None):
"md5": str,
"sha1": None,
"sha256": None,
"content_md5": str,
"content_sha1": None,
"content_sha256": None,
"content_hash_skip": list,
"path": str,
"path_via_symlink": None,
"git_url": str,
Expand Down
34 changes: 29 additions & 5 deletions conda_build/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
LoggingContext,
check_call_env,
check_output_env,
compute_content_hash,
convert_path_for_cygwin_or_msys2,
convert_unix_path_to_win,
copy_into,
Expand All @@ -46,6 +47,8 @@

git_submod_re = re.compile(r"(?:.+)\.(.+)\.(?:.+)\s(.+)")
ext_re = re.compile(r"(.*?)(\.(?:tar\.)?[^.]+)$")
HASH_KEYS = ("md5", "sha1", "sha256")
CONTENT_HASH_KEYS = ("content_md5", "content_sha1", "content_sha256")


def append_hash_to_fn(fn, hash_value):
Expand All @@ -66,7 +69,7 @@ def download_to_cache(cache_folder, recipe_path, source_dict, verbose=False):
source_dict["fn"] if "fn" in source_dict else basename(source_urls[0])
)
hash_added = False
for hash_type in ("md5", "sha1", "sha256"):
for hash_type in HASH_KEYS:
if hash_type in source_dict:
if source_dict[hash_type] in (None, ""):
raise ValueError(f"Empty {hash_type} hash provided for {fn}")
Expand All @@ -75,8 +78,10 @@ def download_to_cache(cache_folder, recipe_path, source_dict, verbose=False):
break
else:
log.warning(
f"No hash (md5, sha1, sha256) provided for {unhashed_fn}. Source download forced. "
"Add hash to recipe to use source cache."
"No hash %s provided for %s. Source download forced. "
"Add hash to recipe to use source cache.",
HASH_KEYS,
unhashed_fn,
)
path = join(cache_folder, fn)
if isfile(path):
Expand Down Expand Up @@ -116,7 +121,7 @@ def download_to_cache(cache_folder, recipe_path, source_dict, verbose=False):
raise RuntimeError(f"Could not download {url}")

hashed = None
for tp in ("md5", "sha1", "sha256"):
for tp in HASH_KEYS:
if tp in source_dict:
expected_hash = source_dict[tp]
hashed = compute_sum(path, tp)
Expand Down Expand Up @@ -1030,7 +1035,7 @@ def provide(metadata):
git = None

try:
for source_dict in metadata.get_section("source"):
for idx, source_dict in enumerate(metadata.get_section("source")):
folder = source_dict.get("folder")
src_dir = os.path.join(metadata.config.work_dir, folder if folder else "")
if any(k in source_dict for k in ("fn", "url")):
Expand Down Expand Up @@ -1109,6 +1114,25 @@ def provide(metadata):
if not isdir(src_dir):
os.makedirs(src_dir)

for hash_type in CONTENT_HASH_KEYS:
if hash_type in source_dict:
expected_content_hash = source_dict[hash_type]
if expected_content_hash in (None, ""):
raise ValueError(
f"Empty {hash_type} hash provided for source item #{idx}"
)
algorithm = hash_type[len("content_") :]
obtained_content_hash = compute_content_hash(
src_dir,
algorithm,
skip=ensure_list(source_dict.get("content_hash_skip") or ()),
)
if expected_content_hash != obtained_content_hash:
raise RuntimeError(
f"{hash_type} mismatch in source item #{idx}: "
beckermr marked this conversation as resolved.
Show resolved Hide resolved
f"obtained '{obtained_content_hash}' != "
f"expected '{expected_content_hash}'"
)
patches = ensure_list(source_dict.get("patches", []))
patch_attributes_output = []
for patch in patches:
Expand Down
82 changes: 81 additions & 1 deletion conda_build/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import urllib.request as urllib
from collections import OrderedDict, defaultdict
from collections.abc import Iterable
from functools import cache
from functools import cache, partial
from glob import glob
from io import StringIO
from itertools import filterfalse
Expand Down Expand Up @@ -1987,6 +1987,86 @@ def sha256_checksum(filename, buffersize=65536):
return sha256.hexdigest()


def compute_content_hash(
    directory: str | Path, algorithm: str = "sha256", skip: Iterable[str] = ()
) -> str:
    """
    Compute a single hash that summarizes the contents of a directory.

    The directory is scanned recursively with `Path.rglob("*")`. Entries are sorted by
    their slash-normalized path relative to `directory` (plain string order, so the
    result does not depend on the platform's `Path` comparison rules). For each entry
    that is not skipped, the hasher is fed the concatenated bytes of:

    - UTF-8 encoded path, relative to the input directory. Backslashes are normalized
      to forward slashes before encoding.
    - Then, depending on the type:
        - For a symlink, the bytes of an `L` marker, followed by the UTF-8 encoded path
          it points to, with backslashes normalized to forward slashes. Symlinks are
          checked first, so a link is never hashed as the file or directory it targets.
        - For a directory, the bytes of a `D` marker, and nothing else.
        - For a regular file, the bytes of an `F` marker, followed by its contents.
          Files that decode entirely as UTF-8 are hashed as text with line endings
          normalized to LF; any other file is hashed as its raw bytes. A file that
          cannot be read (`OSError`) contributes no contents and is logged at debug
          level.
        - Any other type contributes nothing here and is logged at debug level.
    - UTF-8 encoded bytes of the string `-`, as separator.

    Parameters
    ----------
    directory: The path whose contents will be hashed
    algorithm: Name of the algorithm to be used, as expected by `hashlib.new()`
    skip: iterable of relative, slash-normalized paths that should not be hashed. An
          item ending with a slash is interpreted as a directory: the directory entry
          itself and everything below it are ignored. Any other item must match the
          relative path of an entry exactly.

    Returns
    -------
    str
        The hexdigest of the computed hash, as described above.
    """
    root = Path(directory)
    # Materialize once: `skip` is consulted for every entry, and a one-shot iterable
    # (e.g. a generator) would otherwise be exhausted after the first path.
    skip_items = tuple(skip)
    hasher = hashlib.new(algorithm)

    # Sort by the normalized relative path string instead of by Path objects, whose
    # ordering is platform dependent (e.g. case-insensitive on Windows).
    entries = sorted(
        ((str(path.relative_to(root)).replace("\\", "/"), path) for path in root.rglob("*")),
        key=lambda entry: entry[0],
    )

    for relpathstr, path in entries:
        if any(
            # Skip full relpath match
            relpathstr == skip_item
            # Skip directories like .git/ (the directory entry and all its contents)
            or (
                skip_item.endswith("/")
                and (relpathstr.startswith(skip_item) or f"{relpathstr}/" == skip_item)
            )
            for skip_item in skip_items
        ):
            continue
        # encode the relative path to directory, for files, dirs and others
        hasher.update(relpathstr.encode("utf-8"))
        if path.is_symlink():
            hasher.update(b"L")
            hasher.update(str(path.readlink()).replace("\\", "/").encode("utf-8"))
        elif path.is_dir():
            hasher.update(b"D")
        elif path.is_file():
            hasher.update(b"F")
            try:
                # Hash the contents into a copy and only adopt it once the whole file
                # has been processed. A file that turns out to be binary part-way
                # through must not leave partially hashed text in the final digest.
                candidate = hasher.copy()
                try:
                    # Explicit UTF-8 so the text/binary decision and the hashed bytes
                    # do not depend on the locale. Text mode (default newline handling)
                    # already translates line endings to LF for Windows-Unix compat.
                    with open(path, encoding="utf-8") as fh:
                        for line in fh:
                            candidate.update(line.encode("utf-8"))
                except UnicodeDecodeError:
                    # file must be binary: start over and hash the raw bytes
                    candidate = hasher.copy()
                    with open(path, "rb") as fh:
                        for chunk in iter(partial(fh.read, 8192), b""):
                            candidate.update(chunk)
                hasher = candidate
            except OSError as exc:
                get_logger(__name__).debug(
                    "Skipping %s for hashing", path.name, exc_info=exc
                )
        else:
            get_logger(__name__).debug(
                "Can't detect type for path %s. Skipping...", path
            )
        hasher.update(b"-")
    return hasher.hexdigest()


def write_bat_activation_text(file_handle, m):
from .os_utils.external import find_executable

Expand Down
14 changes: 14 additions & 0 deletions docs/source/resources/define-metadata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,20 @@ the repository. Using path allows you to build packages with
unstaged and uncommitted changes in the working directory.
git_url can build only up to the latest commit.

Hashes
------

Conda-build can check the integrity of the provided sources
using different hashing algorithms:

- ``md5``, ``sha1`` and ``sha256`` will check the provided
hexdigest against the downloaded archive, prior to extraction.
- ``content_md5``, ``content_sha1`` and ``content_sha256`` will
check the provided hexdigest against the contents of the
(extracted) directory. ``content_hash_skip`` takes a list of file
and directory paths, relative to the source directory and written
with forward slashes, to be ignored during the check. Directories
must end with a slash (e.g. ``.git/``, useful when ``git_url``
is used to clone a repository).

Patches
-------
Expand Down
20 changes: 20 additions & 0 deletions news/5277-content-hash
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
### Enhancements

* Add new hashing methods (`content_md5`, `content_sha1`, `content_sha256`) to calculate the
checksum of the extracted contents of the downloaded artifacts. (#4821 via #5277)

### Bug fixes

* <news item>

### Deprecations

* <news item>

### Docs

* <news item>

### Other

* <news item>
1 change: 1 addition & 0 deletions tests/test-recipes/metadata/source_url/bld.bat
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
cd conda-build-tar-gz
set PYTHONPATH=.
python -c "import conda_build; assert conda_build.__version__ == 'tag: 1.8.1'"
if errorlevel 1 exit 1
1 change: 1 addition & 0 deletions tests/test-recipes/metadata/source_url/build.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
cd conda-build-tar-gz
# Not sure how versioneer comes up with this version
PYTHONPATH=. python -c "import conda_build; assert conda_build.__version__ == 'tag: 1.8.1'"
38 changes: 33 additions & 5 deletions tests/test-recipes/metadata/source_url/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,39 @@ package:
version: 1.0

source:
fn: conda-build-1.8.1.tar.gz
url: https://github.com/conda/conda-build/archive/1.8.1.tar.gz
md5: 0bf1f3598a659a0e8fb5ee6bbb3fd9fd
sha1: c464a8995ad6bbf0480abd2883876cc9b4913fa7
sha256: f82b0bd5c809c9a7c7256c26364a0065e57732788b7a74c7ea2169135ed2f598
- fn: conda-build-1.8.1.tar.gz
folder: conda-build-tar-gz
url: https://github.com/conda/conda-build/archive/1.8.1.tar.gz
md5: 0bf1f3598a659a0e8fb5ee6bbb3fd9fd
sha1: c464a8995ad6bbf0480abd2883876cc9b4913fa7
sha256: f82b0bd5c809c9a7c7256c26364a0065e57732788b7a74c7ea2169135ed2f598
content_md5: 4d6349dbe4bb1430dc2155a1895e84ce
content_sha1: 9eceebcd86adcf64c2dcd31e36058a377ae2e8cf
content_sha256: 0e3d93b4ba3e6e156a2fc365f5825f0079403cd23f00f89aba21c091e4b0f41a
content_hash_skip:
- conda_build/_version.py
# This is the same tarball but compressed differently. They should have the same content hashes!
- fn: conda-build-1.8.1.zip
folder: conda-build-zip
url: https://github.com/conda/conda-build/archive/1.8.1.zip
md5: 25d59bc816f3d1107f063d77ddfcbe76
sha1: 195104165d395a92c7ecd4c7f98975906950b9dd
sha256: 6d142da3f0f47613d1d0124ec8caf0faf66ec524e6aa356ac49987c3e32b6d95
content_md5: 4d6349dbe4bb1430dc2155a1895e84ce
content_sha1: 9eceebcd86adcf64c2dcd31e36058a377ae2e8cf
content_sha256: 0e3d93b4ba3e6e156a2fc365f5825f0079403cd23f00f89aba21c091e4b0f41a
content_hash_skip:
- conda_build/_version.py
# This is the same tag as above, but cloned directly. They should have the same content hashes!
- folder: conda-build-git
git_url: https://github.com/conda/conda-build.git
git_rev: "1.8.1"
content_md5: 4d6349dbe4bb1430dc2155a1895e84ce
content_sha1: 9eceebcd86adcf64c2dcd31e36058a377ae2e8cf
content_sha256: 0e3d93b4ba3e6e156a2fc365f5825f0079403cd23f00f89aba21c091e4b0f41a
content_hash_skip:
- .git/
- conda_build/_version.py

requirements:
build:
Expand Down
Loading